diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6bf9008c3d677..ccb4a70e4dc23 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14815,6 +14815,13 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, DL, Level))
     return Res;
 
+  if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse() && !VT.isVector()) {
+    SDValue Res =
+        DAG.getFreeze(DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0)));
+    return DAG.getNode(ISD::AssertSext, DL, VT, Res,
+                       DAG.getValueType(N0.getOperand(0).getValueType()));
+  }
+
   return SDValue();
 }
 
@@ -15194,6 +15201,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
     return SDValue(CSENode, 0);
   }
 
+  if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse() && !VT.isVector()) {
+    SDValue Res =
+        DAG.getFreeze(DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)));
+    return DAG.getNode(ISD::AssertZext, DL, VT, Res,
+                       DAG.getValueType(N0.getOperand(0).getValueType()));
+  }
+
   return SDValue();
 }
 
@@ -15362,6 +15376,10 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
   if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, DL, Level))
     return Res;
 
+  if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse())
+    return DAG.getFreeze(
+        DAG.getNode(ISD::ANY_EXTEND, DL, VT, N0.getOperand(0)));
+
   return SDValue();
 }
 
@@ -16911,6 +16929,11 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
     return LegalShuffle;
   }
 
+  if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse()) {
+    SDLoc DL(N);
+    return DAG.getFreeze(DAG.getNode(ISD::BITCAST, DL, VT, N0.getOperand(0)));
+  }
+
   return SDValue();
 }
 
@@ -16943,6 +16966,11 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
   // example https://reviews.llvm.org/D136529#4120959.
   if (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)
     return SDValue();
+  // Avoid folding extensions and bitcasts. Each of these operations handles
+  // FREEZE in its own visitor.
+  if (N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND ||
+      N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::BITCAST)
+    return SDValue();
 
   // Fold freeze(op(x, ...)) -> op(freeze(x), ...).
   // Try to push freeze through instructions that propagate but don't produce
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b5f8ee50cba3d..d5c4235d2c5a0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3448,6 +3448,12 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
   if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
     return false;
 
+  // If we have a large vector type (even if illegal), don't bitcast to large
+  // (illegal) scalar types. Better to load fewer vectors and extract.
+  if (LoadVT.isVector() && !BitcastVT.isVector() && LoadVT.isInteger() &&
+      BitcastVT.isInteger() && (LoadVT.getSizeInBits() % 128) == 0)
+    return false;
+
   // If both types are legal vectors, it's always ok to convert them.
   if (LoadVT.isVector() && BitcastVT.isVector() && isTypeLegal(LoadVT) &&
       isTypeLegal(BitcastVT))
diff --git a/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll b/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll
new file mode 100644
index 0000000000000..361005dfb8664
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+define double @test_bitcast_freeze_load(ptr %p) {
+; CHECK-LABEL: test_bitcast_freeze_load:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ret
+  %v = load <2 x float>, ptr %p
+  %f = freeze <2 x float> %v
+  %b = bitcast <2 x float> %f to double
+  ret double %b
+}
+
+define i32 @test_sext_freeze_load_i8(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrsb w0, [x0]
+; CHECK-NEXT:    ret
+  %v = load i8, ptr %p
+  %f = freeze i8 %v
+  %e = sext i8 %f to i32
+  ret i32 %e
+}
+
+define i64 @test_sext_freeze_load_i32(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrsw x0, [x0]
+; CHECK-NEXT:    ret
+  %v = load i32, ptr %p
+  %f = freeze i32 %v
+  %e = sext i32 %f to i64
+  ret i64 %e
+}
+
+define i64 @test_sext_freeze_load_i16(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrsh x0, [x0]
+; CHECK-NEXT:    ret
+  %v = load i16, ptr %p
+  %f = freeze i16 %v
+  %e = sext i16 %f to i64
+  ret i64 %e
+}
+
+define i32 @test_zext_freeze_load_i8(ptr %p) {
+; CHECK-LABEL: test_zext_freeze_load_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrb w0, [x0]
+; CHECK-NEXT:    ret
+  %v = load i8, ptr %p
+  %f = freeze i8 %v
+  %e = zext i8 %f to i32
+  ret i32 %e
+}
+
+define i64 @test_zext_freeze_load_i32(ptr %p) {
+; CHECK-LABEL: test_zext_freeze_load_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr w0, [x0]
+; CHECK-NEXT:    ret
+  %v = load i32, ptr %p
+  %f = freeze i32 %v
+  %e = zext i32 %f to i64
+  ret i64 %e
+}
+
+define i64 @test_zext_freeze_load_i16(ptr %p) {
+; CHECK-LABEL: test_zext_freeze_load_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrh w0, [x0]
+; CHECK-NEXT:    ret
+  %v = load i16, ptr %p
+  %f = freeze i16 %v
+  %e = zext i16 %f to i64
+  ret i64 %e
+}
+
+define i32 @test_sext_freeze_load_multiuse(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_multiuse:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    sxtb w9, w8
+; CHECK-NEXT:    add w0, w9, w8, uxtb
+; CHECK-NEXT:    ret
+  %v = load i8, ptr %p
+  %f = freeze i8 %v
+  %e = sext i8 %f to i32
+  %z = zext i8 %f to i32
+  %r = add i32 %e, %z
+  ret i32 %r
+}
+
+define <4 x i32> @test_sext_freeze_load_v4i16(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-NEXT:    ret
+  %v = load <4 x i16>, ptr %p
+  %f = freeze <4 x i16> %v
+  %e = sext <4 x i16> %f to <4 x i32>
+  ret <4 x i32> %e
+}
+
+define <4 x i32> @test_zext_freeze_load_v4i16(ptr %p) {
+; CHECK-LABEL: test_zext_freeze_load_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    ret
+  %v = load <4 x i16>, ptr %p
+  %f = freeze <4 x i16> %v
+  %e = zext <4 x i16> %f to <4 x i32>
+  ret <4 x i32> %e
+}
diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll
index fb909fec90434..5920de998977a 100644
--- a/llvm/test/CodeGen/AArch64/freeze.ll
+++ b/llvm/test/CodeGen/AArch64/freeze.ll
@@ -376,10 +376,14 @@ define i32 @freeze_anonstruct() {
 }
 
define i32 @freeze_anonstruct2() { -; CHECK-LABEL: freeze_anonstruct2: -; CHECK: // %bb.0: -; CHECK-NEXT: add w0, w8, w8, uxth -; CHECK-NEXT: ret +; CHECK-SD-LABEL: freeze_anonstruct2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: freeze_anonstruct2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add w0, w8, w8, uxth +; CHECK-GI-NEXT: ret %y1 = freeze {i32, i16} undef %v1 = extractvalue {i32, i16} %y1, 0 %v2 = extractvalue {i32, i16} %y1, 1 diff --git a/llvm/test/CodeGen/AArch64/pr66603.ll b/llvm/test/CodeGen/AArch64/pr66603.ll index 2373b722fa04b..c265a9d5606f3 100644 --- a/llvm/test/CodeGen/AArch64/pr66603.ll +++ b/llvm/test/CodeGen/AArch64/pr66603.ll @@ -5,8 +5,7 @@ define i32 @PR66603(double %x) nounwind { ; CHECK-LABEL: PR66603: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzs w8, d0 -; CHECK-NEXT: sxtb w0, w8 +; CHECK-NEXT: fcvtzs w0, d0 ; CHECK-NEXT: ret %as_i8 = fptosi double %x to i8 %frozen_i8 = freeze i8 %as_i8 diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll index 55c343164a1b8..78f5843442422 100644 --- a/llvm/test/CodeGen/AArch64/vector-compress.ll +++ b/llvm/test/CodeGen/AArch64/vector-compress.ll @@ -12,15 +12,16 @@ define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask) { ; CHECK-NEXT: shl.4s v1, v1, #31 ; CHECK-NEXT: cmlt.4s v1, v1, #0 ; CHECK-NEXT: mov.s w9, v1[1] -; CHECK-NEXT: fmov w11, s1 ; CHECK-NEXT: mov.s w10, v1[2] -; CHECK-NEXT: and x12, x11, #0x1 +; CHECK-NEXT: fmov w11, s1 ; CHECK-NEXT: bfi x8, x11, #2, #1 -; CHECK-NEXT: mov x11, sp +; CHECK-NEXT: and x11, x11, #0x1 ; CHECK-NEXT: and x9, x9, #0x1 -; CHECK-NEXT: add x9, x12, x9 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: add x9, x11, x9 +; CHECK-NEXT: mov x11, sp ; CHECK-NEXT: st1.s { v0 }[1], [x8] -; CHECK-NEXT: sub w10, w9, w10 +; CHECK-NEXT: add w10, w9, w10 ; CHECK-NEXT: orr x9, x11, x9, lsl #2 ; CHECK-NEXT: bfi x11, x10, #2, #2 ; CHECK-NEXT: st1.s { v0 }[2], [x9] @@ -420,15 +421,16 @@ define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) { ; CHECK-NEXT: shl.4s v1, v1, #31 ; CHECK-NEXT: cmlt.4s v1, v1, #0 ; CHECK-NEXT: mov.s w8, v1[1] -; CHECK-NEXT: fmov w10, s1 ; CHECK-NEXT: mov.s w9, v1[2] -; CHECK-NEXT: and x12, x10, #0x1 +; CHECK-NEXT: fmov w10, s1 ; CHECK-NEXT: bfi x11, x10, #2, #1 -; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x10, x10, #0x1 ; CHECK-NEXT: and x8, x8, #0x1 -; CHECK-NEXT: add x8, x12, x8 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: add x8, x10, x8 +; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: st1.s { v0 }[1], [x11] -; CHECK-NEXT: sub w9, w8, w9 +; CHECK-NEXT: add w9, w8, w9 ; CHECK-NEXT: orr x8, x10, x8, lsl #2 ; CHECK-NEXT: bfi x10, x9, #2, #2 ; CHECK-NEXT: st1.s { v0 }[2], [x8] diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll index 4f2b9c5a62669..c61c59068a319 100644 --- a/llvm/test/CodeGen/AArch64/vselect-ext.ll +++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll @@ -594,10 +594,10 @@ define void @extension_in_loop_v16i8_to_v16i32(ptr %src, ptr %dst) { ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: cmp x8, #128 ; CHECK-NEXT: cmge.16b v5, v4, #0 -; CHECK-NEXT: tbl.16b v7, { v4 }, v0 -; CHECK-NEXT: tbl.16b v16, { v4 }, v1 -; CHECK-NEXT: tbl.16b v18, { v4 }, v2 -; CHECK-NEXT: tbl.16b v4, { v4 }, v3 +; CHECK-NEXT: tbl.16b v7, { v4 }, v3 +; CHECK-NEXT: tbl.16b v16, { v4 }, v2 +; CHECK-NEXT: tbl.16b v18, { v4 }, v1 +; CHECK-NEXT: tbl.16b v4, { v4 }, v0 ; CHECK-NEXT: sshll2.8h v6, v5, #0 ; CHECK-NEXT: sshll.8h v5, v5, #0 ; CHECK-NEXT: sshll2.4s v17, v6, #0 @@ 
-664,10 +664,10 @@ define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(ptr %src, ptr %dst) { ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: cmp x8, #128 ; CHECK-NEXT: cmge.16b v5, v4, #0 -; CHECK-NEXT: tbl.16b v7, { v4 }, v0 -; CHECK-NEXT: tbl.16b v16, { v4 }, v1 -; CHECK-NEXT: tbl.16b v18, { v4 }, v2 -; CHECK-NEXT: tbl.16b v4, { v4 }, v3 +; CHECK-NEXT: tbl.16b v7, { v4 }, v3 +; CHECK-NEXT: tbl.16b v16, { v4 }, v2 +; CHECK-NEXT: tbl.16b v18, { v4 }, v1 +; CHECK-NEXT: tbl.16b v4, { v4 }, v0 ; CHECK-NEXT: sshll2.8h v6, v5, #0 ; CHECK-NEXT: sshll.8h v5, v5, #0 ; CHECK-NEXT: sshll2.4s v17, v6, #0 @@ -735,10 +735,10 @@ define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(ptr %src, ptr %dst) { ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: cmp x8, #128 ; CHECK-NEXT: cmge.16b v5, v4, #0 -; CHECK-NEXT: tbl.16b v7, { v4 }, v0 -; CHECK-NEXT: tbl.16b v16, { v4 }, v1 -; CHECK-NEXT: tbl.16b v18, { v4 }, v2 -; CHECK-NEXT: tbl.16b v4, { v4 }, v3 +; CHECK-NEXT: tbl.16b v7, { v4 }, v3 +; CHECK-NEXT: tbl.16b v16, { v4 }, v2 +; CHECK-NEXT: tbl.16b v18, { v4 }, v1 +; CHECK-NEXT: tbl.16b v4, { v4 }, v0 ; CHECK-NEXT: sshll2.8h v6, v5, #0 ; CHECK-NEXT: sshll.8h v5, v5, #0 ; CHECK-NEXT: sshll2.4s v17, v6, #0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 948811ea45f77..eacd960153c29 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -7769,7 +7769,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: sdiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -7938,7 +7938,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: sdiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 ; GFX9-NEXT: s_ashr_i32 s6, s1, 31 @@ -9037,7 +9037,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: srem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 @@ -9208,7 +9208,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: srem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 ; GFX9-NEXT: s_ashr_i32 s2, s1, 31 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 23c5f4f5506f3..d4b3f5c303467 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -11184,19 +11184,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s10 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] @@ -11241,19 +11241,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7 ; GFX8_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 -; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s10 +; GFX8_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 @@ -11297,19 +11297,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX9_ITERATIVE-NEXT: 
v_readlane_b32 s7, v3, s10 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7 ; GFX9_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 -; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s10 +; GFX9_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 @@ -13010,19 +13010,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s10 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] @@ -13067,19 +13067,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7 ; GFX8_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 -; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s10 +; GFX8_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, 
exec ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 @@ -13123,19 +13123,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7 ; GFX9_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 -; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s10 +; GFX9_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 @@ -14831,19 +14831,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s10 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 
s[6:7], 1, s10 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] @@ -14887,19 +14887,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7 ; GFX8_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 -; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s10 +; GFX8_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 @@ -14942,19 +14942,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7 ; GFX9_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 -; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s10 +; GFX9_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 @@ -16645,19 +16645,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: 
s_ff1_i32_b64 s8, s[2:3] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s10 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] @@ -16701,19 +16701,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7 ; GFX8_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 -; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s10 +; GFX8_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 @@ -16756,19 +16756,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; 
GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7 ; GFX9_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 -; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s10 +; GFX9_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 0fc54aeaef77b..c187aac4fc4a2 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -6,77 +6,77 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-LABEL: v_sdiv_v2i128_vv: ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 -; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 0, v0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3 ; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11 -; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f -; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc +; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v26, v24 ; SDAG-NEXT: v_mov_b32_e32 v27, v25 -; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v2, vcc -; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v21, v1, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v0, v16, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, v19, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v1, v20 -; SDAG-NEXT: v_ffbh_u32_e32 v2, v21 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, v0, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v0, v20, v16 -; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0, v8 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v16 +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, 0, v2, vcc +; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v3, vcc +; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e32 v17, v3, v17, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v16, v2, v16, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v21, v1, v20, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v20, v0, v18, vcc +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, 0, v8 ; SDAG-NEXT: v_or_b32_e32 v1, v21, v17 -; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc -; SDAG-NEXT: v_min_u32_e32 v2, v19, v2 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v22 +; SDAG-NEXT: v_or_b32_e32 v0, v20, v16 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v9, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v18, v16 ; SDAG-NEXT: v_ffbh_u32_e32 v22, v17 +; SDAG-NEXT: v_ffbh_u32_e32 v23, v20 +; SDAG-NEXT: v_ffbh_u32_e32 v28, v21 ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[6:7] ; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[6:7] -; SDAG-NEXT: v_min_u32_e32 v1, v19, v22 -; SDAG-NEXT: 
v_add_i32_e64 v2, s[8:9], 64, v2 -; SDAG-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[6:7] -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v10, v2, v1, vcc -; SDAG-NEXT: v_ffbh_u32_e32 v3, v29 -; SDAG-NEXT: v_ffbh_u32_e32 v19, v28 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v8, s[6:7] -; SDAG-NEXT: v_or_b32_e32 v2, v29, v0 -; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v3 -; SDAG-NEXT: v_ffbh_u32_e32 v11, v0 +; SDAG-NEXT: v_add_i32_e64 v1, s[6:7], 32, v18 +; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v23 +; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v11, vcc +; SDAG-NEXT: v_min_u32_e32 v22, v1, v22 +; SDAG-NEXT: v_min_u32_e32 v18, v18, v28 +; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e32 v1, v11, v23, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v28, v9, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v29, v8, v2, vcc +; SDAG-NEXT: v_add_i32_e32 v8, vcc, 64, v18 +; SDAG-NEXT: v_addc_u32_e64 v9, s[6:7], 0, 0, vcc ; SDAG-NEXT: v_or_b32_e32 v3, v28, v1 -; SDAG-NEXT: v_min_u32_e32 v8, v8, v19 -; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11 -; SDAG-NEXT: v_ffbh_u32_e32 v19, v1 +; SDAG-NEXT: v_or_b32_e32 v2, v29, v0 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v22, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v10, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v11, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v18, v29 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v28 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_min_u32_e32 v2, v11, v19 -; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 64, v8 -; SDAG-NEXT: v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7] -; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v11, v8, 0, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[6:7] +; SDAG-NEXT: v_add_i32_e64 v2, s[6:7], 32, v10 +; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 32, v18 +; SDAG-NEXT: v_min_u32_e32 v2, v2, v11 +; SDAG-NEXT: v_min_u32_e32 v3, v3, v22 ; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] -; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v2, v10 -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v11, v9, vcc +; SDAG-NEXT: v_add_i32_e32 v3, vcc, 64, v3 +; SDAG-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v2, v8 +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v10, v9, vcc ; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v8 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v18, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9] -; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v18, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v19, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[8:9] +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v19, vcc ; SDAG-NEXT: v_or_b32_e32 v2, v2, v10 ; SDAG-NEXT: v_or_b32_e32 v3, v9, v11 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v19, v18, s[4:5] ; SDAG-NEXT: v_and_b32_e32 v2, 1, v2 ; SDAG-NEXT: 
v_cmp_eq_u32_e64 s[4:5], 1, v2 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] @@ -1564,67 +1564,67 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3 -; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v29, v28 ; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc -; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v17, v1, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v18, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v18, v16 -; SDAG-NEXT: v_ffbh_u32_e32 v20, v17 +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v3, vcc ; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v2, v16, v0 -; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v0 -; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc -; SDAG-NEXT: v_or_b32_e32 v3, v17, v1 -; SDAG-NEXT: v_min_u32_e32 v18, v18, v20 -; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v22 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v1 -; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[4:5] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] -; SDAG-NEXT: v_min_u32_e32 v3, v20, v22 -; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18 -; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v9, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v16, v0, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v3 +; SDAG-NEXT: v_ffbh_u32_e32 v23, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v24, v1 +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, 0, v10, vcc +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[16:17] +; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 +; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v23 ; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v9, s[4:5] -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_min_u32_e32 v16, v16, v22 +; SDAG-NEXT: v_min_u32_e32 v17, v17, v24 +; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e32 v11, v11, v20, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v10, v10, v25, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v30, v9, v18, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v31, v8, v21, vcc +; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17 +; SDAG-NEXT: v_addc_u32_e64 v18, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_or_b32_e32 v9, v30, v11 +; SDAG-NEXT: v_or_b32_e32 v8, v31, v10 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v10, v8, v3, vcc -; SDAG-NEXT: v_ffbh_u32_e32 v9, v31 -; SDAG-NEXT: v_ffbh_u32_e32 v21, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v20, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v8, v31, v2 -; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v9 -; SDAG-NEXT: v_ffbh_u32_e32 v20, v2 -; SDAG-NEXT: v_or_b32_e32 v9, v30, v3 -; 
SDAG-NEXT: v_min_u32_e32 v11, v11, v21 -; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v20 -; SDAG-NEXT: v_ffbh_u32_e32 v21, v3 +; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v17, v10 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v11 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v31 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v30 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_min_u32_e32 v8, v20, v21 -; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v11 -; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[4:5] -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v8, v10 -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc -; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v10 +; SDAG-NEXT: v_add_i32_e64 v8, s[6:7], 32, v17 +; SDAG-NEXT: v_add_i32_e64 v9, s[6:7], 32, v21 +; SDAG-NEXT: v_min_u32_e32 v8, v8, v20 +; SDAG-NEXT: v_min_u32_e32 v9, v9, v22 +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; SDAG-NEXT: v_add_i32_e32 v9, vcc, 64, v9 +; SDAG-NEXT: v_addc_u32_e64 v17, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v8, v16 +; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v17, v18, vcc +; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v16 ; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v19, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v19, vcc ; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 -; SDAG-NEXT: v_or_b32_e32 v9, v11, v19 +; SDAG-NEXT: v_or_b32_e32 v9, v17, v19 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] @@ -1633,71 +1633,71 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v34, v1, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v34, v3, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v27, v17, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v32, v2, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v27, v1, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc -; SDAG-NEXT: v_cndmask_b32_e64 v33, v16, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v33, v0, 0, s[4:5] ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB2_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v10 -; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v10 +; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v16 +; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v16 ; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: v_mov_b32_e32 v9, 0 -; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc -; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20 +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v17, vcc +; SDAG-NEXT: v_lshl_b64 v[20:21], v[0:1], v20 ; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc -; SDAG-NEXT: v_or_b32_e32 v18, v32, v34 -; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v10 -; SDAG-NEXT: v_or_b32_e32 v19, v33, v35 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[0:1], v24 -; SDAG-NEXT: 
v_sub_i32_e32 v25, vcc, 64, v24 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v24 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25 -; SDAG-NEXT: v_or_b32_e32 v11, v11, v19 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v17, v32, v34 +; SDAG-NEXT: v_sub_i32_e32 v19, vcc, 0x7f, v16 +; SDAG-NEXT: v_or_b32_e32 v18, v33, v35 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v19 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v19 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v19 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[17:18] +; SDAG-NEXT: v_lshr_b64 v[16:17], v[0:1], v16 +; SDAG-NEXT: v_or_b32_e32 v17, v23, v17 +; SDAG-NEXT: v_or_b32_e32 v16, v22, v16 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v19 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v20, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v25, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v24, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[22:23], v[16:17], v32 +; SDAG-NEXT: v_lshr_b64 v[22:23], v[0:1], v32 ; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 64, v32 ; SDAG-NEXT: v_subrev_i32_e32 v37, vcc, 64, v32 -; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v32 +; SDAG-NEXT: v_lshr_b64 v[24:25], v[2:3], v32 ; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31 ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 ; SDAG-NEXT: v_mov_b32_e32 v9, 0 -; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v8 -; SDAG-NEXT: v_lshr_b64 v[48:49], v[0:1], v37 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[2:3], v8 +; SDAG-NEXT: v_lshr_b64 v[48:49], v[2:3], v37 ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc ; SDAG-NEXT: v_or_b32_e32 v8, v23, v27 ; SDAG-NEXT: v_or_b32_e32 v22, v22, v26 -; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v2, vcc +; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v10, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32 ; SDAG-NEXT: v_cndmask_b32_e64 v8, v49, v8, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v22, v48, v22, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v27, 0, v25, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v26, 0, v24, s[4:5] -; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v3, vcc +; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v11, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 -; SDAG-NEXT: v_cndmask_b32_e32 v25, v8, v17, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v24, v22, v16, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v24, v22, v0, vcc ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: .LBB2_3: ; %udiv-do-while3 @@ -1707,13 +1707,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v25 ; SDAG-NEXT: 
v_lshl_b64 v[24:25], v[24:25], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v11 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 ; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 ; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 ; SDAG-NEXT: v_or_b32_e32 v22, v26, v48 ; SDAG-NEXT: v_or_b32_e32 v23, v24, v49 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v8 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v8 ; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v36, v23 ; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v25, vcc ; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v38, v22, vcc @@ -1721,8 +1721,8 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8 ; SDAG-NEXT: v_and_b32_e32 v24, v8, v31 ; SDAG-NEXT: v_and_b32_e32 v26, v8, v30 -; SDAG-NEXT: v_and_b32_e32 v48, v8, v2 -; SDAG-NEXT: v_and_b32_e32 v49, v8, v3 +; SDAG-NEXT: v_and_b32_e32 v48, v8, v10 +; SDAG-NEXT: v_and_b32_e32 v49, v8, v11 ; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 ; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v23, v24 ; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v26, vcc @@ -1735,9 +1735,9 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v22, v32, v34 ; SDAG-NEXT: v_or_b32_e32 v23, v33, v35 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23] -; SDAG-NEXT: v_or_b32_e32 v11, v19, v11 +; SDAG-NEXT: v_or_b32_e32 v17, v19, v17 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v10, v18, v10 +; SDAG-NEXT: v_or_b32_e32 v16, v18, v16 ; SDAG-NEXT: v_mov_b32_e32 v23, v9 ; SDAG-NEXT: v_mov_b32_e32 v22, v8 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] @@ -1746,123 +1746,123 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB2_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v22 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v22 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v34, v19, v11 -; SDAG-NEXT: v_or_b32_e32 v32, v18, v10 +; SDAG-NEXT: v_or_b32_e32 v34, v19, v17 +; SDAG-NEXT: v_or_b32_e32 v32, v18, v16 ; SDAG-NEXT: v_or_b32_e32 v27, v9, v21 ; SDAG-NEXT: v_or_b32_e32 v33, v8, v20 ; SDAG-NEXT: .LBB2_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7 ; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_mov_b32_e32 v35, v26 ; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, 0, v6, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v9, v5, v9, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v8, v4, v8, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v10, v8 -; SDAG-NEXT: v_ffbh_u32_e32 v11, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v4, v6, v16, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v16, v8 +; SDAG-NEXT: v_ffbh_u32_e32 v18, v9 ; SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; SDAG-NEXT: v_sub_i32_e32 v19, vcc, 0, v12 ; SDAG-NEXT: v_or_b32_e32 v6, v8, v4 ; SDAG-NEXT: v_ffbh_u32_e32 v20, v4 -; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v10 +; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], 32, v16 ; 
SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v13, vcc ; SDAG-NEXT: v_or_b32_e32 v7, v9, v5 ; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v20 ; SDAG-NEXT: v_ffbh_u32_e32 v22, v5 -; SDAG-NEXT: v_min_u32_e32 v10, v10, v11 -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v14, vcc +; SDAG-NEXT: v_min_u32_e32 v16, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v14, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] ; SDAG-NEXT: v_cndmask_b32_e64 v36, v13, v21, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v37, v12, v19, s[4:5] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[6:7] ; SDAG-NEXT: v_min_u32_e32 v7, v20, v22 -; SDAG-NEXT: v_add_i32_e64 v10, s[8:9], 64, v10 -; SDAG-NEXT: v_addc_u32_e64 v12, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v13, vcc, 0, v15, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v11, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v11, v37 -; SDAG-NEXT: v_ffbh_u32_e32 v14, v36 +; SDAG-NEXT: v_add_i32_e64 v12, s[8:9], 64, v16 +; SDAG-NEXT: v_addc_u32_e64 v13, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, 0, v15, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v18, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v14, v37 +; SDAG-NEXT: v_ffbh_u32_e32 v18, v36 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v19, v10, v7, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v13, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v10, v37, v6 -; SDAG-NEXT: v_ffbh_u32_e32 v13, v6 -; SDAG-NEXT: v_add_i32_e32 v15, vcc, 32, v11 -; SDAG-NEXT: v_or_b32_e32 v11, v36, v7 -; SDAG-NEXT: v_add_i32_e32 v13, vcc, 32, v13 -; SDAG-NEXT: v_ffbh_u32_e32 v20, v7 -; SDAG-NEXT: v_min_u32_e32 v14, v15, v14 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_min_u32_e32 v10, v13, v20 -; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], 64, v14 -; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v13, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v20, v12, v7, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v16, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v12, v37, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v15, v6 +; SDAG-NEXT: v_add_i32_e32 v14, vcc, 32, v14 +; SDAG-NEXT: v_or_b32_e32 v13, v36, v7 +; SDAG-NEXT: v_add_i32_e32 v15, vcc, 32, v15 +; SDAG-NEXT: v_ffbh_u32_e32 v16, v7 +; SDAG-NEXT: v_min_u32_e32 v14, v14, v18 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] +; SDAG-NEXT: v_min_u32_e32 v12, v15, v16 +; SDAG-NEXT: v_add_i32_e64 v13, s[4:5], 64, v14 +; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, v13, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc -; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v13, v12, vcc -; SDAG-NEXT: v_xor_b32_e32 v12, 0x7f, v10 -; SDAG-NEXT: v_subb_u32_e32 v14, vcc, 0, v18, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v15, vcc, 0, v18, vcc -; SDAG-NEXT: v_or_b32_e32 v12, v12, v14 +; SDAG-NEXT: v_cndmask_b32_e64 v14, v14, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; SDAG-NEXT: v_sub_i32_e32 v12, vcc, v12, v20 +; SDAG-NEXT: v_subb_u32_e32 v13, vcc, v14, v19, vcc +; SDAG-NEXT: v_xor_b32_e32 v14, 0x7f, v12 +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, 0, v17, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[12:13] +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v17, vcc +; SDAG-NEXT: v_or_b32_e32 v14, v14, v16 
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v15, v13, v17 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] -; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v13, v11, v15 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; SDAG-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] -; SDAG-NEXT: v_and_b32_e32 v12, 1, v18 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12 +; SDAG-NEXT: v_and_b32_e32 v14, 1, v18 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v13, v9, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, v8, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v15, v9, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v14, v8, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 -; SDAG-NEXT: v_add_i32_e32 v38, vcc, 1, v10 -; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v10 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 -; SDAG-NEXT: v_mov_b32_e32 v13, 0 -; SDAG-NEXT: v_addc_u32_e32 v39, vcc, 0, v11, vcc +; SDAG-NEXT: v_add_i32_e32 v38, vcc, 1, v12 +; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v12 +; SDAG-NEXT: v_mov_b32_e32 v14, 0 +; SDAG-NEXT: v_mov_b32_e32 v15, 0 +; SDAG-NEXT: v_addc_u32_e32 v39, vcc, 0, v13, vcc ; SDAG-NEXT: v_lshl_b64 v[18:19], v[8:9], v18 -; SDAG-NEXT: v_addc_u32_e32 v48, vcc, 0, v14, vcc -; SDAG-NEXT: v_addc_u32_e32 v49, vcc, 0, v15, vcc -; SDAG-NEXT: v_or_b32_e32 v14, v38, v48 -; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 0x7f, v10 -; SDAG-NEXT: v_or_b32_e32 v15, v39, v49 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[4:5], v22 +; SDAG-NEXT: v_addc_u32_e32 v48, vcc, 0, v16, vcc +; SDAG-NEXT: v_addc_u32_e32 v49, vcc, 0, v17, vcc +; SDAG-NEXT: v_or_b32_e32 v16, v38, v48 +; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 0x7f, v12 +; SDAG-NEXT: v_or_b32_e32 v17, v39, v49 +; SDAG-NEXT: v_lshl_b64 v[12:13], v[4:5], v22 ; SDAG-NEXT: v_sub_i32_e32 v23, vcc, 64, v22 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[8:9], v22 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] -; SDAG-NEXT: v_lshr_b64 v[14:15], v[8:9], v23 -; SDAG-NEXT: v_or_b32_e32 v11, v11, v15 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v14 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_lshr_b64 v[16:17], v[8:9], v23 +; SDAG-NEXT: v_or_b32_e32 v13, v13, v17 +; SDAG-NEXT: v_or_b32_e32 v12, v12, v16 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22 -; SDAG-NEXT: v_cndmask_b32_e64 v14, v19, v11, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v10, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v21, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v19, v13, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v12, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v20, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 -; SDAG-NEXT: v_cndmask_b32_e64 v15, v14, v5, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v14, v18, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v16, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v18, v4, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: s_and_saveexec_b64 
s[4:5], vcc @@ -1870,52 +1870,52 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_cbranch_execz .LBB2_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader ; SDAG-NEXT: v_lshr_b64 v[20:21], v[8:9], v38 -; SDAG-NEXT: v_sub_i32_e32 v12, vcc, 64, v38 +; SDAG-NEXT: v_sub_i32_e32 v14, vcc, 64, v38 ; SDAG-NEXT: v_subrev_i32_e32 v51, vcc, 64, v38 ; SDAG-NEXT: v_lshr_b64 v[22:23], v[4:5], v38 ; SDAG-NEXT: v_add_i32_e32 v50, vcc, -1, v37 ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_mov_b32_e32 v13, 0 -; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v12 +; SDAG-NEXT: v_mov_b32_e32 v15, 0 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v14 ; SDAG-NEXT: v_lshr_b64 v[53:54], v[4:5], v51 ; SDAG-NEXT: v_addc_u32_e32 v51, vcc, -1, v36, vcc -; SDAG-NEXT: v_or_b32_e32 v12, v21, v25 +; SDAG-NEXT: v_or_b32_e32 v14, v21, v25 ; SDAG-NEXT: v_or_b32_e32 v20, v20, v24 ; SDAG-NEXT: v_addc_u32_e32 v52, vcc, -1, v6, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v38 -; SDAG-NEXT: v_cndmask_b32_e64 v12, v54, v12, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v14, v54, v14, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v20, v53, v20, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v25, 0, v23, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v22, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v53, vcc, -1, v7, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38 -; SDAG-NEXT: v_cndmask_b32_e32 v23, v12, v9, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v23, v14, v9, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v22, v20, v8, vcc ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: .LBB2_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v12, 31, v23 +; SDAG-NEXT: v_lshrrev_b32_e32 v14, 31, v23 ; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v54, 31, v15 -; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v55, 31, v11 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; SDAG-NEXT: v_or_b32_e32 v24, v24, v12 +; SDAG-NEXT: v_lshrrev_b32_e32 v54, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v55, 31, v13 +; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; SDAG-NEXT: v_or_b32_e32 v24, v24, v14 ; SDAG-NEXT: v_or_b32_e32 v22, v22, v54 -; SDAG-NEXT: v_or_b32_e32 v12, v14, v55 -; SDAG-NEXT: v_or_b32_e32 v15, v19, v15 -; SDAG-NEXT: v_or_b32_e32 v11, v21, v11 -; SDAG-NEXT: v_or_b32_e32 v14, v18, v12 -; SDAG-NEXT: v_sub_i32_e32 v12, vcc, v50, v22 -; SDAG-NEXT: v_subb_u32_e32 v12, vcc, v51, v23, vcc -; SDAG-NEXT: v_subb_u32_e32 v12, vcc, v52, v24, vcc -; SDAG-NEXT: v_subb_u32_e32 v12, vcc, v53, v25, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v12 -; SDAG-NEXT: v_and_b32_e32 v12, 1, v21 +; SDAG-NEXT: v_or_b32_e32 v14, v16, v55 +; SDAG-NEXT: v_or_b32_e32 v17, v19, v17 +; SDAG-NEXT: v_or_b32_e32 v13, v21, v13 +; SDAG-NEXT: v_or_b32_e32 v16, v18, v14 +; SDAG-NEXT: v_sub_i32_e32 v14, vcc, v50, v22 +; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v51, v23, vcc +; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v52, v24, vcc +; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v53, v25, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v14 +; SDAG-NEXT: v_and_b32_e32 v14, 1, v21 ; SDAG-NEXT: v_and_b32_e32 v54, v21, v7 ; SDAG-NEXT: v_and_b32_e32 v55, v21, v6 ; SDAG-NEXT: v_and_b32_e32 v40, v21, v36 @@ -1932,80 +1932,80 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v54, v38, v48 ; 
SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[54:55] ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v10, v20, v10 -; SDAG-NEXT: v_mov_b32_e32 v21, v13 -; SDAG-NEXT: v_mov_b32_e32 v20, v12 +; SDAG-NEXT: v_or_b32_e32 v12, v20, v12 +; SDAG-NEXT: v_mov_b32_e32 v21, v15 +; SDAG-NEXT: v_mov_b32_e32 v20, v14 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB2_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB2_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v11 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; SDAG-NEXT: v_or_b32_e32 v14, v14, v20 -; SDAG-NEXT: v_or_b32_e32 v19, v19, v15 -; SDAG-NEXT: v_or_b32_e32 v13, v13, v11 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v14 -; SDAG-NEXT: v_or_b32_e32 v12, v12, v10 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v13 +; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 +; SDAG-NEXT: v_or_b32_e32 v19, v19, v17 +; SDAG-NEXT: v_or_b32_e32 v15, v15, v13 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v16 +; SDAG-NEXT: v_or_b32_e32 v14, v14, v12 ; SDAG-NEXT: .LBB2_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v14, v33, v3 -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0 -; SDAG-NEXT: v_mul_lo_u32 v24, v27, v2 +; SDAG-NEXT: v_mul_lo_u32 v13, v33, v11 +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v33, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v10, v27, v10 ; SDAG-NEXT: v_mul_lo_u32 v25, v34, v31 ; SDAG-NEXT: v_mul_lo_u32 v34, v32, v30 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v33, 0 -; SDAG-NEXT: v_mov_b32_e32 v15, 0 -; SDAG-NEXT: v_mul_lo_u32 v38, v12, v7 -; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v6, 0 -; SDAG-NEXT: v_mul_lo_u32 v39, v13, v6 -; SDAG-NEXT: v_mul_lo_u32 v19, v19, v37 -; SDAG-NEXT: v_mul_lo_u32 v48, v18, v36 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v37, v12, 0 -; SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; SDAG-NEXT: v_mov_b32_e32 v14, v3 -; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[14:15] -; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v2 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v21, v38 -; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v24 -; SDAG-NEXT: v_mov_b32_e32 v14, v22 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v27, v[14:15] -; SDAG-NEXT: v_xor_b32_e32 v24, v16, v28 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v21, v39 -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[10:11] -; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], v23, v3 -; SDAG-NEXT: v_addc_u32_e64 v23, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v31, vcc, v17, v2, vcc -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21] -; SDAG-NEXT: v_mov_b32_e32 v14, v7 -; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v36, v12, v[14:15] +; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v31, v33, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mul_lo_u32 v38, v14, v7 +; SDAG-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v14, v6, 0 +; SDAG-NEXT: v_mul_lo_u32 v39, v15, v6 +; SDAG-NEXT: v_mul_lo_u32 v48, v19, v37 +; SDAG-NEXT: v_mul_lo_u32 v49, v18, v36 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v37, v14, 0 +; SDAG-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; SDAG-NEXT: v_mov_b32_e32 v19, v17 +; SDAG-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v30, v33, v[19:20] +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; SDAG-NEXT: v_add_i32_e64 v13, s[4:5], v22, v38 +; SDAG-NEXT: v_add_i32_e64 v12, 
s[4:5], v12, v10 +; SDAG-NEXT: v_mov_b32_e32 v19, v23 +; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v31, v27, v[19:20] +; SDAG-NEXT: v_xor_b32_e32 v23, v0, v28 +; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], v13, v39 +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[11:12] +; SDAG-NEXT: v_add_i32_e64 v12, s[4:5], v24, v17 +; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v1, v16, vcc +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v37, v[21:22] +; SDAG-NEXT: v_mov_b32_e32 v19, v7 +; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v36, v14, v[19:20] ; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v25, v11 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v30, v27, v[22:23] -; SDAG-NEXT: v_xor_b32_e32 v18, v31, v29 -; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v19, v3 -; SDAG-NEXT: v_mov_b32_e32 v14, v16 -; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v37, v13, v[14:15] +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v30, v27, v[12:13] +; SDAG-NEXT: v_xor_b32_e32 v18, v24, v29 +; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], v48, v1 +; SDAG-NEXT: v_mov_b32_e32 v19, v16 +; SDAG-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v37, v15, v[19:20] ; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v34, v7 -; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v48, v3 -; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v17, v15 -; SDAG-NEXT: v_addc_u32_e64 v16, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v49, v1 +; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v17, v14 +; SDAG-NEXT: v_addc_u32_e64 v17, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], v11, v10 ; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v10, vcc -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v36, v13, v[15:16] -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc -; SDAG-NEXT: v_xor_b32_e32 v7, v0, v28 -; SDAG-NEXT: v_add_i32_e32 v10, vcc, v10, v2 -; SDAG-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc -; SDAG-NEXT: v_xor_b32_e32 v3, v1, v29 -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v24, v28 +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v2, v1, vcc +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v36, v15, v[16:17] +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; SDAG-NEXT: v_xor_b32_e32 v7, v10, v28 +; SDAG-NEXT: v_add_i32_e32 v10, vcc, v1, v0 +; SDAG-NEXT: v_addc_u32_e32 v11, vcc, v2, v19, vcc +; SDAG-NEXT: v_xor_b32_e32 v3, v3, v29 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v23, v28 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v18, v29, vcc ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v7, v28, vcc ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v29, vcc ; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v14, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v13, vcc ; SDAG-NEXT: v_xor_b32_e32 v6, v6, v26 ; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v10, vcc ; SDAG-NEXT: v_xor_b32_e32 v7, v7, v35 diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index a025c36f620c7..7e233e648cdbc 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -211,22 +211,23 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, -1 +; SI-NEXT: v_mov_b32_e32 v1, 0x432fffff ; SI-NEXT: s_brev_b32 s8, -2 -; SI-NEXT: v_mov_b32_e32 v1, 0x43300000 -; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: v_mov_b32_e32 v2, -1 -; SI-NEXT: v_mov_b32_e32 v3, 0x432fffff +; 
SI-NEXT: v_mov_b32_e32 v3, 0x43300000 +; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v4, s3 +; SI-NEXT: v_bfi_b32 v3, s8, v3, v4 ; SI-NEXT: v_mov_b32_e32 v6, s3 -; SI-NEXT: v_bfi_b32 v1, s8, v1, v6 ; SI-NEXT: v_mov_b32_e32 v7, s2 -; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[0:1] -; SI-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1] -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[2:3] +; SI-NEXT: v_add_f64 v[2:3], v[4:5], -v[2:3] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[0:1] +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -270,19 +271,21 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s9, 0x432fffff ; SI-NEXT: s_brev_b32 s10, -2 ; SI-NEXT: v_mov_b32_e32 v6, 0x43300000 -; SI-NEXT: s_mov_b32 s9, 0x432fffff ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_bfi_b32 v1, s10, v6, v1 ; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_bfi_b32 v1, s10, v6, v7 ; SI-NEXT: v_mov_b32_e32 v8, s6 ; SI-NEXT: v_mov_b32_e32 v9, s5 -; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: v_mov_b32_e32 v10, s5 +; SI-NEXT: v_mov_b32_e32 v11, s4 ; SI-NEXT: v_add_f64 v[2:3], s[6:7], v[0:1] ; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1] ; SI-NEXT: v_bfi_b32 v1, s10, v6, v9 @@ -292,8 +295,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[0:1] ; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1] ; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -347,26 +350,30 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x11 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s13, 0x432fffff ; SI-NEXT: s_brev_b32 s14, -2 ; SI-NEXT: v_mov_b32_e32 v10, 0x43300000 -; SI-NEXT: s_mov_b32 s13, 0x432fffff ; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: s_mov_b32 s12, s10 ; SI-NEXT: v_mov_b32_e32 v8, s12 ; SI-NEXT: v_mov_b32_e32 v9, s13 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_bfi_b32 v5, s14, v10, v0 ; SI-NEXT: v_mov_b32_e32 v2, s3 -; SI-NEXT: v_bfi_b32 v5, s14, v10, v2 ; SI-NEXT: v_mov_b32_e32 v6, s2 +; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v7, s1 ; SI-NEXT: v_mov_b32_e32 v11, s0 ; SI-NEXT: v_mov_b32_e32 v12, s7 -; SI-NEXT: v_mov_b32_e32 v13, s6 -; SI-NEXT: v_mov_b32_e32 v14, s5 -; SI-NEXT: v_mov_b32_e32 v15, s4 +; SI-NEXT: v_mov_b32_e32 v13, s7 +; SI-NEXT: v_mov_b32_e32 v14, s6 +; SI-NEXT: v_mov_b32_e32 v15, s5 +; SI-NEXT: v_mov_b32_e32 v16, s5 +; SI-NEXT: v_mov_b32_e32 v17, s4 ; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[4:5] ; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] -; SI-NEXT: v_bfi_b32 v5, s14, v10, v7 +; SI-NEXT: 
v_bfi_b32 v5, s14, v10, v3 ; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc @@ -378,15 +385,15 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc ; SI-NEXT: v_add_f64 v[6:7], s[6:7], v[4:5] ; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[4:5] -; SI-NEXT: v_bfi_b32 v5, s14, v10, v14 +; SI-NEXT: v_bfi_b32 v5, s14, v10, v15 ; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc ; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[4:5] ; SI-NEXT: v_add_f64 v[4:5], v[10:11], -v[4:5] ; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v5, v5, v14, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v4, v15, vcc +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v16, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v17, vcc ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 6f91222b2f396..a37e7dc5e31a9 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -4307,22 +4307,30 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; SI-NEXT: s_nop 1 ; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11] ; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] -; SI-NEXT: v_readfirstlane_b32 s0, v4 -; SI-NEXT: v_readfirstlane_b32 s1, v5 -; SI-NEXT: s_bfe_u32 s2, s1, 0xb0014 -; SI-NEXT: s_add_i32 s8, s2, 0xfffffc01 -; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8 -; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3] -; SI-NEXT: s_and_b32 s9, s1, 0x80000000 +; SI-NEXT: v_readfirstlane_b32 s2, v4 +; SI-NEXT: v_readfirstlane_b32 s3, v5 +; SI-NEXT: s_bfe_u32 s0, s3, 0xb0014 +; SI-NEXT: s_add_i32 s8, s0, 0xfffffc01 +; SI-NEXT: s_mov_b32 s1, 0xfffff +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s8 +; SI-NEXT: v_not_b32_e32 v6, s0 +; SI-NEXT: v_and_b32_e32 v4, v4, v6 +; SI-NEXT: v_not_b32_e32 v6, s1 +; SI-NEXT: v_and_b32_e32 v5, v5, v6 +; SI-NEXT: s_and_b32 s0, s3, 0x80000000 ; SI-NEXT: s_cmp_lt_i32 s8, 0 -; SI-NEXT: s_cselect_b32 s2, 0, s2 -; SI-NEXT: s_cselect_b32 s3, s9, s3 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; SI-NEXT: v_mov_b32_e32 v6, s0 +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc ; SI-NEXT: s_cmp_gt_i32 s8, 51 -; SI-NEXT: s_cselect_b32 s1, s1, s3 -; SI-NEXT: s_cselect_b32 s0, s0, s2 -; SI-NEXT: v_fma_f64 v[0:1], -s[0:1], v[2:3], v[0:1] +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_mov_b32_e32 v6, s3 +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; SI-NEXT: v_mov_b32_e32 v6, s2 +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -4585,22 +4593,30 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: s_nop 1 ; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11] ; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] -; SI-NEXT: v_readfirstlane_b32 s0, v4 -; SI-NEXT: v_readfirstlane_b32 s1, v5 -; SI-NEXT: s_bfe_u32 s2, s1, 0xb0014 -; SI-NEXT: 
s_add_i32 s8, s2, 0xfffffc01 -; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8 -; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3] -; SI-NEXT: s_and_b32 s9, s1, 0x80000000 +; SI-NEXT: v_readfirstlane_b32 s2, v4 +; SI-NEXT: v_readfirstlane_b32 s3, v5 +; SI-NEXT: s_bfe_u32 s0, s3, 0xb0014 +; SI-NEXT: s_add_i32 s8, s0, 0xfffffc01 +; SI-NEXT: s_mov_b32 s1, 0xfffff +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s8 +; SI-NEXT: v_not_b32_e32 v6, s0 +; SI-NEXT: v_and_b32_e32 v4, v4, v6 +; SI-NEXT: v_not_b32_e32 v6, s1 +; SI-NEXT: v_and_b32_e32 v5, v5, v6 +; SI-NEXT: s_and_b32 s0, s3, 0x80000000 ; SI-NEXT: s_cmp_lt_i32 s8, 0 -; SI-NEXT: s_cselect_b32 s2, 0, s2 -; SI-NEXT: s_cselect_b32 s3, s9, s3 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; SI-NEXT: v_mov_b32_e32 v6, s0 +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc ; SI-NEXT: s_cmp_gt_i32 s8, 51 -; SI-NEXT: s_cselect_b32 s1, s1, s3 -; SI-NEXT: s_cselect_b32 s0, s0, s2 -; SI-NEXT: v_fma_f64 v[0:1], -s[0:1], v[2:3], v[0:1] +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_mov_b32_e32 v6, s3 +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; SI-NEXT: v_mov_b32_e32 v6, s2 +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index 6ae058b38e74f..c43a9ffa3d57d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -492,21 +492,21 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mul_hi_u32 v1, s1, v0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mul_hi_i32 v1, s1, v0 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mul_hi_u32 v3, s1, v2 ; SI-NEXT: s_mul_i32 s4, s1, s2 -; SI-NEXT: v_mov_b32_e32 v2, s3 -; SI-NEXT: v_mul_hi_u32 v3, s0, v2 -; SI-NEXT: s_mul_i32 s5, s0, s3 ; SI-NEXT: v_mul_hi_u32 v0, s0, v0 -; SI-NEXT: v_mul_hi_i32 v2, s1, v2 +; SI-NEXT: s_mul_i32 s5, s0, s3 +; SI-NEXT: v_mul_hi_u32 v2, s0, v2 ; SI-NEXT: s_mul_i32 s6, s1, s3 ; SI-NEXT: s_mul_i32 s8, s0, s2 -; SI-NEXT: v_readfirstlane_b32 s9, v1 -; SI-NEXT: v_readfirstlane_b32 s10, v3 -; SI-NEXT: v_readfirstlane_b32 s11, v0 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, s5, v0 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v0 +; SI-NEXT: v_readfirstlane_b32 s11, v2 +; SI-NEXT: v_readfirstlane_b32 s12, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s5, v2 ; SI-NEXT: s_add_u32 s5, s11, s5 ; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v0 ; SI-NEXT: s_addc_u32 s10, 0, s10 @@ -540,31 +540,31 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s7, s0, s3 -; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s5, s0, s3 -; GFX9-NEXT: s_add_u32 s9, s8, s7 +; GFX9-NEXT: s_mul_i32 s8, s0, s3 +; GFX9-NEXT: s_mul_hi_u32 s9, s0, s2 +; GFX9-NEXT: s_mul_hi_u32 s7, s0, s3 +; GFX9-NEXT: s_add_u32 s10, s9, s8 ; GFX9-NEXT: s_mul_i32 s6, s1, s2 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2 -; GFX9-NEXT: s_add_u32 s9, s9, s6 -; GFX9-NEXT: s_mul_hi_i32 s10, s1, s3 -; GFX9-NEXT: 
s_addc_u32 s4, s5, s4 -; GFX9-NEXT: s_addc_u32 s5, s10, 0 -; GFX9-NEXT: s_mul_i32 s9, s1, s3 -; GFX9-NEXT: s_add_u32 s4, s4, s9 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_sub_u32 s9, s4, s2 -; GFX9-NEXT: s_subb_u32 s10, s5, 0 +; GFX9-NEXT: s_addc_u32 s7, 0, s7 +; GFX9-NEXT: s_mul_hi_u32 s5, s1, s2 +; GFX9-NEXT: s_add_u32 s10, s10, s6 +; GFX9-NEXT: s_mul_hi_i32 s4, s1, s3 +; GFX9-NEXT: s_addc_u32 s5, s7, s5 +; GFX9-NEXT: s_addc_u32 s4, s4, 0 +; GFX9-NEXT: s_mul_i32 s7, s1, s3 +; GFX9-NEXT: s_add_u32 s5, s5, s7 +; GFX9-NEXT: s_addc_u32 s4, 0, s4 +; GFX9-NEXT: s_sub_u32 s7, s5, s2 +; GFX9-NEXT: s_subb_u32 s10, s4, 0 ; GFX9-NEXT: s_cmp_lt_i32 s1, 0 -; GFX9-NEXT: s_cselect_b32 s4, s9, s4 -; GFX9-NEXT: s_cselect_b32 s1, s10, s5 -; GFX9-NEXT: s_sub_u32 s9, s4, s0 +; GFX9-NEXT: s_cselect_b32 s1, s10, s4 +; GFX9-NEXT: s_cselect_b32 s4, s7, s5 +; GFX9-NEXT: s_sub_u32 s7, s4, s0 ; GFX9-NEXT: s_subb_u32 s5, s1, 0 ; GFX9-NEXT: s_cmp_lt_i32 s3, 0 ; GFX9-NEXT: s_cselect_b32 s5, s5, s1 -; GFX9-NEXT: s_cselect_b32 s4, s9, s4 -; GFX9-NEXT: s_add_i32 s1, s8, s7 +; GFX9-NEXT: s_cselect_b32 s4, s7, s4 +; GFX9-NEXT: s_add_i32 s1, s9, s8 ; GFX9-NEXT: s_add_i32 s1, s1, s6 ; GFX9-NEXT: s_ashr_i32 s6, s1, 31 ; GFX9-NEXT: s_mov_b32 s7, s6 @@ -581,33 +581,33 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s7, s0, s3 -; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2 -; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3 -; GFX10-NEXT: s_mul_i32 s6, s1, s2 -; GFX10-NEXT: s_add_u32 s11, s8, s7 -; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2 -; GFX10-NEXT: s_addc_u32 s5, 0, s5 -; GFX10-NEXT: s_mul_hi_i32 s9, s1, s3 -; GFX10-NEXT: s_add_u32 s11, s11, s6 +; GFX10-NEXT: s_mul_hi_u32 s6, s0, s2 +; GFX10-NEXT: s_mul_i32 s9, s0, s3 +; GFX10-NEXT: s_mul_hi_u32 s8, s0, s3 +; GFX10-NEXT: s_mul_i32 s7, s1, s2 +; GFX10-NEXT: s_add_u32 s11, s6, s9 +; GFX10-NEXT: s_mul_hi_u32 s5, s1, s2 +; GFX10-NEXT: s_addc_u32 s8, 0, s8 +; GFX10-NEXT: s_mul_hi_i32 s4, s1, s3 +; GFX10-NEXT: s_add_u32 s11, s11, s7 ; GFX10-NEXT: s_mul_i32 s10, s1, s3 -; GFX10-NEXT: s_addc_u32 s4, s5, s4 -; GFX10-NEXT: s_addc_u32 s5, s9, 0 -; GFX10-NEXT: s_add_u32 s4, s4, s10 -; GFX10-NEXT: s_addc_u32 s5, 0, s5 -; GFX10-NEXT: s_sub_u32 s9, s4, s2 -; GFX10-NEXT: s_subb_u32 s10, s5, 0 +; GFX10-NEXT: s_addc_u32 s5, s8, s5 +; GFX10-NEXT: s_addc_u32 s4, s4, 0 +; GFX10-NEXT: s_add_u32 s5, s5, s10 +; GFX10-NEXT: s_addc_u32 s4, 0, s4 +; GFX10-NEXT: s_sub_u32 s8, s5, s2 +; GFX10-NEXT: s_subb_u32 s10, s4, 0 ; GFX10-NEXT: s_cmp_lt_i32 s1, 0 -; GFX10-NEXT: s_cselect_b32 s1, s9, s4 -; GFX10-NEXT: s_cselect_b32 s4, s10, s5 -; GFX10-NEXT: s_sub_u32 s9, s1, s0 +; GFX10-NEXT: s_cselect_b32 s1, s8, s5 +; GFX10-NEXT: s_cselect_b32 s4, s10, s4 +; GFX10-NEXT: s_sub_u32 s8, s1, s0 ; GFX10-NEXT: s_subb_u32 s5, s4, 0 ; GFX10-NEXT: s_cmp_lt_i32 s3, 0 ; GFX10-NEXT: s_mul_i32 s0, s0, s2 ; GFX10-NEXT: s_cselect_b32 s5, s5, s4 -; GFX10-NEXT: s_cselect_b32 s4, s9, s1 -; GFX10-NEXT: s_add_i32 s1, s8, s7 -; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_cselect_b32 s4, s8, s1 +; GFX10-NEXT: s_add_i32 s1, s6, s9 +; GFX10-NEXT: s_add_i32 s1, s1, s7 ; GFX10-NEXT: s_ashr_i32 s6, s1, 31 ; GFX10-NEXT: s_mov_b32 s7, s6 ; GFX10-NEXT: s_cmp_lg_u64 s[4:5], s[6:7] @@ -622,34 +622,34 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s7, s0, s3 -; 
GFX11-NEXT: s_mul_hi_u32 s8, s0, s2 -; GFX11-NEXT: s_mul_hi_u32 s5, s0, s3 -; GFX11-NEXT: s_mul_i32 s6, s1, s2 -; GFX11-NEXT: s_add_u32 s11, s8, s7 -; GFX11-NEXT: s_mul_hi_u32 s4, s1, s2 -; GFX11-NEXT: s_addc_u32 s5, 0, s5 -; GFX11-NEXT: s_mul_hi_i32 s9, s1, s3 -; GFX11-NEXT: s_add_u32 s11, s11, s6 +; GFX11-NEXT: s_mul_hi_u32 s6, s0, s2 +; GFX11-NEXT: s_mul_i32 s9, s0, s3 +; GFX11-NEXT: s_mul_hi_u32 s8, s0, s3 +; GFX11-NEXT: s_mul_i32 s7, s1, s2 +; GFX11-NEXT: s_add_u32 s11, s6, s9 +; GFX11-NEXT: s_mul_hi_u32 s5, s1, s2 +; GFX11-NEXT: s_addc_u32 s8, 0, s8 +; GFX11-NEXT: s_mul_hi_i32 s4, s1, s3 +; GFX11-NEXT: s_add_u32 s11, s11, s7 ; GFX11-NEXT: s_mul_i32 s10, s1, s3 -; GFX11-NEXT: s_addc_u32 s4, s5, s4 -; GFX11-NEXT: s_addc_u32 s5, s9, 0 -; GFX11-NEXT: s_add_u32 s4, s4, s10 -; GFX11-NEXT: s_addc_u32 s5, 0, s5 -; GFX11-NEXT: s_sub_u32 s9, s4, s2 -; GFX11-NEXT: s_subb_u32 s10, s5, 0 +; GFX11-NEXT: s_addc_u32 s5, s8, s5 +; GFX11-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-NEXT: s_add_u32 s5, s5, s10 +; GFX11-NEXT: s_addc_u32 s4, 0, s4 +; GFX11-NEXT: s_sub_u32 s8, s5, s2 +; GFX11-NEXT: s_subb_u32 s10, s4, 0 ; GFX11-NEXT: s_cmp_lt_i32 s1, 0 -; GFX11-NEXT: s_cselect_b32 s1, s9, s4 -; GFX11-NEXT: s_cselect_b32 s4, s10, s5 -; GFX11-NEXT: s_sub_u32 s9, s1, s0 +; GFX11-NEXT: s_cselect_b32 s1, s8, s5 +; GFX11-NEXT: s_cselect_b32 s4, s10, s4 +; GFX11-NEXT: s_sub_u32 s8, s1, s0 ; GFX11-NEXT: s_subb_u32 s5, s4, 0 ; GFX11-NEXT: s_cmp_lt_i32 s3, 0 ; GFX11-NEXT: s_mul_i32 s0, s0, s2 ; GFX11-NEXT: s_cselect_b32 s5, s5, s4 -; GFX11-NEXT: s_cselect_b32 s4, s9, s1 -; GFX11-NEXT: s_add_i32 s1, s8, s7 +; GFX11-NEXT: s_cselect_b32 s4, s8, s1 +; GFX11-NEXT: s_add_i32 s1, s6, s9 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s1, s1, s6 +; GFX11-NEXT: s_add_i32 s1, s1, s7 ; GFX11-NEXT: s_ashr_i32 s6, s1, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s7, s6 @@ -666,17 +666,17 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mul_hi_u32 s4, s0, s2 ; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3 ; GFX12-NEXT: s_mul_i32 s6, s0, s3 -; GFX12-NEXT: s_mul_hi_u32 s4, s0, s2 -; GFX12-NEXT: s_mul_i32 s10, s1, s2 +; GFX12-NEXT: s_mul_i32 s13, s1, s2 ; GFX12-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7] -; GFX12-NEXT: s_mul_hi_u32 s9, s1, s2 -; GFX12-NEXT: s_mul_hi_i32 s11, s1, s3 -; GFX12-NEXT: s_add_co_u32 s4, s6, s10 -; GFX12-NEXT: s_add_co_ci_u32 s4, s7, s9 +; GFX12-NEXT: s_mul_hi_u32 s12, s1, s2 +; GFX12-NEXT: s_mul_hi_i32 s9, s1, s3 +; GFX12-NEXT: s_add_co_u32 s4, s6, s13 +; GFX12-NEXT: s_add_co_ci_u32 s4, s7, s12 ; GFX12-NEXT: s_mul_i32 s8, s1, s3 -; GFX12-NEXT: s_add_co_ci_u32 s9, s11, 0 +; GFX12-NEXT: s_add_co_ci_u32 s9, s9, 0 ; GFX12-NEXT: s_cmp_lt_i32 s1, 0 ; GFX12-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[8:9] ; GFX12-NEXT: s_mov_b32 s4, s2 diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 3d9c2a29cb9c1..2292105c14bc5 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -463,41 +463,39 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 { ; SI-LABEL: test_smul24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dword s4, s[4:5], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_bfe_i32 s0, s8, 0x180000 -; SI-NEXT: s_bfe_i32 s1, s2, 0x180000 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: s_mul_i32 s0, s1, s0 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: s_bfe_i32 s5, s6, 0x180000 +; SI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: s_mul_i32 s5, s4, s5 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dword s3, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfe_i32 s2, s2, 0x180000 -; VI-NEXT: s_bfe_i32 s3, s4, 0x180000 +; VI-NEXT: s_bfe_i32 s3, s3, 0x180000 ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0 ; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smul24_i33: @@ -576,32 +574,30 @@ entry: define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_smulhi24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dword s7, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dword s7, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mul_hi_i32_i24_e32 v0, 
s7, v0 ; VI-NEXT: v_and_b32_e32 v0, 1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smulhi24_i33: diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll index f497752994852..1d878a02d2525 100644 --- a/llvm/test/CodeGen/AMDGPU/select-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll @@ -846,8 +846,7 @@ define i64 @poison_should_freeze(i1 %cond1, i32 %val, i16 %val2, i64 %a, i64 %b) ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_mov_b32_e32 v7, 0x5040100 -; GCN-NEXT: v_perm_b32 v2, v2, s4, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll index 8d0e00383d692..dcd7ed441fbae 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll @@ -3967,8 +3967,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4071,8 +4071,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4179,8 +4179,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -4287,8 +4287,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[2:3], v[6:7] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4391,8 +4391,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[6:7] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, 
v4, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4506,8 +4506,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll index f15ecf014ab0b..515d36f9967a8 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll @@ -3966,8 +3966,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4070,8 +4070,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4178,8 +4178,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -4286,8 +4286,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[2:3], v[6:7] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4390,8 +4390,8 @@ define 
i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[6:7] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4505,8 +4505,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll index e62165cb933c5..fba4bd516183c 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll @@ -3843,8 +3843,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -3947,8 +3947,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4055,8 +4055,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -4163,8 +4163,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[2:3], v[6:7] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: 
v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4267,8 +4267,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[6:7] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4382,8 +4382,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll index 83ecaaa7e0846..6ffff5968d4e0 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll @@ -3579,8 +3579,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -3683,8 +3683,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -3791,8 +3791,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -3899,8 +3899,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, 
vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4003,8 +4003,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[6:7] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4118,8 +4118,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/RISCV/pr66603.ll b/llvm/test/CodeGen/RISCV/pr66603.ll index cfe8ceed12582..eb3d1a3b916e2 100644 --- a/llvm/test/CodeGen/RISCV/pr66603.ll +++ b/llvm/test/CodeGen/RISCV/pr66603.ll @@ -7,15 +7,11 @@ define i32 @PR66603(double %x) nounwind { ; RV32-LABEL: PR66603: ; RV32: # %bb.0: ; RV32-NEXT: fcvt.w.d a0, fa0, rtz -; RV32-NEXT: slli a0, a0, 24 -; RV32-NEXT: srai a0, a0, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: PR66603: ; RV64: # %bb.0: ; RV64-NEXT: fcvt.l.d a0, fa0, rtz -; RV64-NEXT: slli a0, a0, 56 -; RV64-NEXT: srai a0, a0, 56 ; RV64-NEXT: ret %as_i8 = fptosi double %x to i8 %frozen_i8 = freeze i8 %as_i8 diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll index 8a6a30318ae58..d18b64f12e527 100644 --- a/llvm/test/CodeGen/SystemZ/pr60413.ll +++ b/llvm/test/CodeGen/SystemZ/pr60413.ll @@ -74,40 +74,8 @@ define dso_local void @m() local_unnamed_addr #1 { ; CHECK-NEXT: rosbg %r2, %r3, 32, 61, 2 ; CHECK-NEXT: rosbg %r2, %r5, 32, 62, 1 ; CHECK-NEXT: or %r2, %r14 -; CHECK-NEXT: vlgvb %r4, %v0, 1 -; CHECK-NEXT: vlgvb %r3, %v0, 0 -; CHECK-NEXT: risbg %r3, %r3, 48, 176, 15 -; CHECK-NEXT: rosbg %r3, %r4, 49, 49, 14 -; CHECK-NEXT: vlgvb %r4, %v0, 2 -; CHECK-NEXT: rosbg %r3, %r4, 50, 50, 13 -; CHECK-NEXT: vlgvb %r4, %v0, 3 -; CHECK-NEXT: rosbg %r3, %r4, 51, 51, 12 -; CHECK-NEXT: vlgvb %r4, %v0, 4 -; CHECK-NEXT: rosbg %r3, %r4, 52, 52, 11 -; CHECK-NEXT: vlgvb %r4, %v0, 5 -; CHECK-NEXT: rosbg %r3, %r4, 53, 53, 10 -; CHECK-NEXT: vlgvb %r4, %v0, 6 -; CHECK-NEXT: rosbg %r3, %r4, 54, 54, 9 -; CHECK-NEXT: vlgvb %r4, %v0, 7 -; CHECK-NEXT: rosbg %r3, %r4, 55, 55, 8 -; CHECK-NEXT: vlgvb %r4, %v0, 8 -; CHECK-NEXT: rosbg %r3, %r4, 56, 56, 7 -; CHECK-NEXT: vlgvb %r4, %v0, 9 -; CHECK-NEXT: rosbg %r3, %r4, 57, 57, 6 -; CHECK-NEXT: vlgvb %r4, %v0, 10 -; CHECK-NEXT: rosbg %r3, %r4, 58, 58, 5 -; CHECK-NEXT: vlgvb %r4, %v0, 11 -; CHECK-NEXT: rosbg %r3, %r4, 59, 59, 4 -; 
CHECK-NEXT: vlgvb %r4, %v0, 12 -; CHECK-NEXT: rosbg %r3, %r4, 60, 60, 3 -; CHECK-NEXT: vlgvb %r4, %v0, 13 -; CHECK-NEXT: rosbg %r3, %r4, 61, 61, 2 -; CHECK-NEXT: vlgvb %r4, %v0, 14 -; CHECK-NEXT: rosbg %r3, %r4, 62, 62, 1 -; CHECK-NEXT: vlgvb %r4, %v0, 15 -; CHECK-NEXT: rosbg %r3, %r4, 63, 63, 0 ; CHECK-NEXT: xilf %r3, 4294967295 -; CHECK-NEXT: or %r3, %r2 +; CHECK-NEXT: rosbg %r3, %r2, 48, 63, 0 ; CHECK-NEXT: tmll %r3, 65535 ; CHECK-NEXT: ipm %r2 ; CHECK-NEXT: afi %r2, -268435456 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll b/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll index 080c6c1a1efdc..27bc1e76a7ee2 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll @@ -2902,7 +2902,7 @@ define arm_aapcs_vfpcc <8 x half> @faddqr_v8f16_y(<8 x half> %x, half %y, i32 %n ; CHECK-NEXT: vctp.16 r0 ; CHECK-NEXT: vdup.16 q1, r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vaddt.f16 q1, q0, r1 +; CHECK-NEXT: vaddt.f16 q1, q1, q0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -2978,7 +2978,7 @@ define arm_aapcs_vfpcc <8 x half> @fmulqr_v8f16_y(<8 x half> %x, half %y, i32 %n ; CHECK-NEXT: vctp.16 r0 ; CHECK-NEXT: vdup.16 q1, r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.f16 q1, q0, r1 +; CHECK-NEXT: vmult.f16 q1, q1, q0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll index 79849a7153c91..d9b4635042256 100644 --- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll +++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll @@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src, ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512: ; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08] ; X86-NEXT: vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1] diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll index 0f2c75b15d5b4..01b7618753a23 100644 --- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll +++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll @@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src, ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256: ; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08] ; X86-NEXT: vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1] @@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8 ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128: ; X86: # %bb.0: -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] 
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08] ; X86-NEXT: vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1] diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll index 1a712ffac5b7e..c60d9a3ff17d3 100644 --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -6,8 +6,7 @@ define <8 x i16> @zext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone { ; KNL-LABEL: zext_8x8mem_to_8x16: ; KNL: # %bb.0: -; KNL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; KNL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; KNL-NEXT: vpsllw $15, %xmm0, %xmm0 ; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 ; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -22,8 +21,7 @@ define <8 x i16> @zext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone ; ; AVX512DQNOBW-LABEL: zext_8x8mem_to_8x16: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX512DQNOBW-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX512DQNOBW-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX512DQNOBW-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -37,8 +35,7 @@ define <8 x i16> @zext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone define <8 x i16> @sext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone { ; KNL-LABEL: sext_8x8mem_to_8x16: ; KNL: # %bb.0: -; KNL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; KNL-NEXT: vpmovsxbw %xmm1, %xmm1 +; KNL-NEXT: vpmovsxbw (%rdi), %xmm1 ; KNL-NEXT: vpsllw $15, %xmm0, %xmm0 ; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 ; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -53,8 +50,7 @@ define <8 x i16> @sext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone ; ; AVX512DQNOBW-LABEL: sext_8x8mem_to_8x16: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512DQNOBW-NEXT: vpmovsxbw %xmm1, %xmm1 +; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %xmm1 ; AVX512DQNOBW-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX512DQNOBW-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX512DQNOBW-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -212,10 +208,8 @@ define <32 x i16> @zext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; KNL-NEXT: vmovdqu (%rdi), %ymm2 -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2 -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 @@ -237,10 +231,8 @@ define <32 x i16> @zext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn ; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512DQNOBW-NEXT: vmovdqu (%rdi), %ymm2 -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm0, %ymm0 @@ -261,10 +253,8 @@ define <32 x i16> @sext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; KNL-NEXT: vmovdqu (%rdi), %ymm2 -; KNL-NEXT: vpmovsxbw %xmm2, %ymm3 -; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2 -; KNL-NEXT: vpmovsxbw %xmm2, %ymm2 +; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm2 +; KNL-NEXT: vpmovsxbw (%rdi), %ymm3 ; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 @@ -286,10 +276,8 @@ define <32 x i16> @sext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn ; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512DQNOBW-NEXT: vmovdqu (%rdi), %ymm2 -; AVX512DQNOBW-NEXT: vpmovsxbw %xmm2, %ymm3 -; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQNOBW-NEXT: vpmovsxbw %xmm2, %ymm2 +; AVX512DQNOBW-NEXT: vpmovsxbw 16(%rdi), %ymm2 +; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %ymm3 ; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll index e223765eb887b..7e3a902044615 100644 --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -864,12 +864,11 @@ define i32 @freeze_ssubo(i32 %a0, i32 %a1, i8 %a2, i8 %a3) nounwind { ; X86-LABEL: freeze_ssubo: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: addb {{[0-9]+}}(%esp), %dl -; X86-NEXT: setb %cl -; X86-NEXT: andl $1, %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: addb {{[0-9]+}}(%esp), %cl +; X86-NEXT: setb %dl +; X86-NEXT: subl %edx, %eax ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; @@ -896,12 +895,11 @@ define i32 @freeze_usubo(i32 %a0, i32 %a1, i8 %a2, i8 %a3) nounwind { ; X86-LABEL: freeze_usubo: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: addb {{[0-9]+}}(%esp), %dl -; X86-NEXT: setb %cl -; X86-NEXT: andl $1, %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: addb {{[0-9]+}}(%esp), %cl +; X86-NEXT: setb %dl +; X86-NEXT: subl %edx, %eax ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/freeze.ll b/llvm/test/CodeGen/X86/freeze.ll index 38e3e23f7caac..a5549be92e793 100644 --- a/llvm/test/CodeGen/X86/freeze.ll +++ b/llvm/test/CodeGen/X86/freeze.ll @@ -96,8 +96,6 @@ define i32 @freeze_anonstruct() { define i32 @freeze_anonstruct2() { ; X86ASM-LABEL: freeze_anonstruct2: ; X86ASM: # %bb.0: -; X86ASM-NEXT: movzwl %ax, %eax -; X86ASM-NEXT: addl %eax, %eax ; X86ASM-NEXT: retq %y1 = freeze {i32, i16} undef %v1 = extractvalue {i32, i16} 
%y1, 0 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll index a4750b4cd4ad0..b1237b31660c2 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -863,20 +863,18 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind ; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5 -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512F-NEXT: vpsubb %ymm1, %ymm5, %ymm1 +; AVX512F-NEXT: vpsubb %ymm0, %ymm5, %ymm5 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm4 & (zmm1 ^ zmm0)) +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg: @@ -895,20 +893,18 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm5, %ymm1 +; 
AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm4 & (zmm1 ^ zmm0)) +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_mem_reg: @@ -953,19 +949,17 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5 -; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm3, %ymm7, %ymm3 -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5)) +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512F-NEXT: vpsubb %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm5 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = zmm2 ^ (zmm4 & (zmm3 ^ zmm2)) +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpaddb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; @@ -985,19 +979,17 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm7, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5)) +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, 
%xmm5, %xmm5 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm5, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm3 = zmm2 ^ (zmm4 & (zmm3 ^ zmm2)) +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm3, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll index 2f8cd4d41af54..c9ef6b6c4cdb2 100644 --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -4451,8 +4451,8 @@ define i32 @PR39665_c_ray_select(<2 x double> %x, <2 x double> %y) { ; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k0 -; KNL-NEXT: knotw %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: notb %al ; KNL-NEXT: testb $3, %al ; KNL-NEXT: movl $42, %ecx ; KNL-NEXT: movl $99, %eax @@ -4463,8 +4463,8 @@ define i32 @PR39665_c_ray_select(<2 x double> %x, <2 x double> %y) { ; SKX-LABEL: PR39665_c_ray_select: ; SKX: # %bb.0: ; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0 -; SKX-NEXT: knotw %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: notb %al ; SKX-NEXT: testb $3, %al ; SKX-NEXT: movl $42, %ecx ; SKX-NEXT: movl $99, %eax diff --git a/llvm/test/CodeGen/X86/pr162812.ll b/llvm/test/CodeGen/X86/pr162812.ll index cec093c3df743..02703b7e32cc6 100644 --- a/llvm/test/CodeGen/X86/pr162812.ll +++ b/llvm/test/CodeGen/X86/pr162812.ll @@ -34,32 +34,47 @@ define <32 x i8> @PR162812(<32 x i8> %a, <32 x i8> %mask) { ; ; SSE42-LABEL: PR162812: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: psrlw $2, %xmm2 -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [8224,8224,8224,8224,8224,8224,8224,8224] -; SSE42-NEXT: pand %xmm5, %xmm2 -; SSE42-NEXT: paddb %xmm2, %xmm2 -; SSE42-NEXT: paddb %xmm2, %xmm2 +; SSE42-NEXT: movdqa %xmm2, %xmm5 +; SSE42-NEXT: movdqa %xmm0, %xmm2 ; SSE42-NEXT: movdqa %xmm0, %xmm6 -; SSE42-NEXT: paddb %xmm0, %xmm6 -; SSE42-NEXT: movdqa %xmm2, %xmm0 -; SSE42-NEXT: pblendvb %xmm0, %xmm6, %xmm4 -; SSE42-NEXT: psrlw $2, %xmm3 -; SSE42-NEXT: pand %xmm3, %xmm5 +; SSE42-NEXT: psllw $4, %xmm6 +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE42-NEXT: pand %xmm7, %xmm6 +; SSE42-NEXT: psrlw $2, %xmm5 +; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [8224,8224,8224,8224,8224,8224,8224,8224] +; SSE42-NEXT: pand %xmm4, %xmm5 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: pblendvb %xmm0, %xmm6, %xmm2 +; SSE42-NEXT: movdqa %xmm2, %xmm6 +; SSE42-NEXT: paddb %xmm2, %xmm6 ; SSE42-NEXT: paddb %xmm5, %xmm5 ; SSE42-NEXT: paddb %xmm5, %xmm5 -; SSE42-NEXT: movdqa %xmm1, %xmm2 -; SSE42-NEXT: paddb %xmm1, %xmm2 ; SSE42-NEXT: movdqa %xmm5, %xmm0 -; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE42-NEXT: pblendvb %xmm0, %xmm6, %xmm2 +; SSE42-NEXT: movdqa %xmm1, %xmm5 +; SSE42-NEXT: psllw $4, %xmm5 +; SSE42-NEXT: pand %xmm7, %xmm5 +; SSE42-NEXT: psrlw $2, %xmm3 +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: movdqa %xmm4, %xmm0 +; SSE42-NEXT: pblendvb %xmm0, %xmm5, %xmm1 +; SSE42-NEXT: movdqa %xmm1, %xmm3 +; SSE42-NEXT: paddb %xmm1, %xmm3 +; SSE42-NEXT: paddb %xmm4, %xmm4 +; SSE42-NEXT: paddb %xmm4, %xmm4 ; SSE42-NEXT: movdqa %xmm4, %xmm0 +; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm1 +; SSE42-NEXT: movdqa %xmm2, %xmm0 ; 
SSE42-NEXT: retq ; ; AVX2-LABEL: PR162812: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 @@ -67,9 +82,12 @@ define <32 x i8> @PR162812(<32 x i8> %a, <32 x i8> %mask) { ; ; AVX512-LABEL: PR162812: ; AVX512: # %bb.0: -; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX512-NEXT: vpsrlw $2, %ymm1, %ymm1 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 +; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2 ; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll index b8e83da9cf361..ebb5e135eacd0 100644 --- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -281,7 +281,7 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-AVX2-NEXT: vpsllvd %ymm1, %ymm2, %ymm2 -; X64-AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; X64-AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] ; X64-AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; X64-AVX2-NEXT: vpsrlvd %ymm1, %ymm3, %ymm1 diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll index fce879585289a..ebd57cb941552 100644 --- a/llvm/test/CodeGen/X86/var-permute-128.ll +++ b/llvm/test/CodeGen/X86/var-permute-128.ll @@ -501,39 +501,39 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw ; SSE3-NEXT: pextrw $0, %xmm1, %eax ; SSE3-NEXT: pextrw $1, %xmm1, %ecx ; SSE3-NEXT: pextrw $2, %xmm1, %edx -; SSE3-NEXT: pextrw $3, %xmm1, %edi -; SSE3-NEXT: pextrw $4, %xmm1, %r8d -; SSE3-NEXT: pextrw $5, %xmm1, %r9d -; SSE3-NEXT: pextrw $6, %xmm1, %r10d -; SSE3-NEXT: pextrw $7, %xmm1, %esi +; SSE3-NEXT: pextrw $3, %xmm1, %esi +; SSE3-NEXT: pextrw $4, %xmm1, %edi +; SSE3-NEXT: pextrw $5, %xmm1, %r8d +; SSE3-NEXT: pextrw $6, %xmm1, %r9d +; SSE3-NEXT: pextrw $7, %xmm1, %r10d ; SSE3-NEXT: movdqa %xmm2, -24(%rsp) ; SSE3-NEXT: andl $7, %eax -; SSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax ; SSE3-NEXT: andl $7, %ecx -; SSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx ; SSE3-NEXT: andl $7, %edx -; SSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx +; SSE3-NEXT: andl $7, %esi ; SSE3-NEXT: andl $7, %edi -; SSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi ; SSE3-NEXT: andl $7, %r8d -; SSE3-NEXT: movzwl -24(%rsp,%r8,2), %r8d ; SSE3-NEXT: andl 
$7, %r9d -; SSE3-NEXT: movzwl -24(%rsp,%r9,2), %r9d ; SSE3-NEXT: andl $7, %r10d ; SSE3-NEXT: movzwl -24(%rsp,%r10,2), %r10d -; SSE3-NEXT: andl $7, %esi -; SSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi -; SSE3-NEXT: movd %esi, %xmm1 -; SSE3-NEXT: movd %r10d, %xmm2 +; SSE3-NEXT: movd %r10d, %xmm1 +; SSE3-NEXT: movzwl -24(%rsp,%r9,2), %r9d +; SSE3-NEXT: movd %r9d, %xmm2 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE3-NEXT: movd %r9d, %xmm1 -; SSE3-NEXT: movd %r8d, %xmm3 +; SSE3-NEXT: movzwl -24(%rsp,%r8,2), %r8d +; SSE3-NEXT: movd %r8d, %xmm1 +; SSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi +; SSE3-NEXT: movd %edi, %xmm3 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE3-NEXT: movd %edi, %xmm1 +; SSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi +; SSE3-NEXT: movd %esi, %xmm1 +; SSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx ; SSE3-NEXT: movd %edx, %xmm2 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx ; SSE3-NEXT: movd %ecx, %xmm1 +; SSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax ; SSE3-NEXT: movd %eax, %xmm4 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] @@ -1102,9 +1102,8 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices) ; SSE3-NEXT: movq %xmm1, %rcx ; SSE3-NEXT: andl $1, %ecx ; SSE3-NEXT: movaps %xmm0, -24(%rsp) -; SSE3-NEXT: movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero -; SSE3-NEXT: movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero +; SSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1] ; SSE3-NEXT: pandn %xmm0, %xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 ; SSE3-NEXT: retq @@ -1127,9 +1126,8 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices) ; SSSE3-NEXT: movq %xmm1, %rcx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movaps %xmm0, -24(%rsp) -; SSSE3-NEXT: movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero -; SSSE3-NEXT: movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero +; SSSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1] ; SSSE3-NEXT: pandn %xmm0, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq @@ -1302,16 +1300,16 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n ; SSE3-NEXT: movd %xmm1, %esi ; SSE3-NEXT: movaps %xmm2, -24(%rsp) ; SSE3-NEXT: andl $3, %eax -; SSE3-NEXT: movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero ; SSE3-NEXT: andl $3, %ecx -; SSE3-NEXT: movd -24(%rsp,%rcx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero ; SSE3-NEXT: andl $3, %edx -; SSE3-NEXT: movd -24(%rsp,%rdx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero ; SSE3-NEXT: andl $3, %esi -; SSE3-NEXT: movd -24(%rsp,%rsi,4), %xmm4 # xmm4 = mem[0],zero,zero,zero -; SSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE3-NEXT: movd -24(%rsp,%rsi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero +; SSE3-NEXT: movd 
-24(%rsp,%rdx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero +; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE3-NEXT: movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero +; SSE3-NEXT: movd -24(%rsp,%rcx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE3-NEXT: pandn %xmm1, %xmm0 ; SSE3-NEXT: retq ; @@ -1329,8 +1327,9 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSSE3-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: por %xmm2, %xmm1 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: var_shuffle_zero_v4f32: @@ -1341,8 +1340,9 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n ; SSE41-NEXT: por %xmm2, %xmm1 ; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [67372036,67372036,67372036,67372036] ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 ; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: pandn %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; XOP-LABEL: var_shuffle_zero_v4f32: diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll index 1a6351524ffbd..5af992c2d05dd 100644 --- a/llvm/test/CodeGen/X86/vector-compress.ll +++ b/llvm/test/CodeGen/X86/vector-compress.ll @@ -289,7 +289,8 @@ define <8 x i32> @test_compress_v8i32(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> ; AVX2-NEXT: cmpq $8, %r11 ; AVX2-NEXT: cmovbl (%rsp,%rax,4), %ebx ; AVX2-NEXT: vmovss %xmm0, (%rsp) -; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4) ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) ; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rsi,4) ; AVX2-NEXT: andl $7, %edi @@ -363,7 +364,8 @@ define <8 x float> @test_compress_v8f32(<8 x float> %vec, <8 x i1> %mask, <8 x f ; AVX2-NEXT: vmovss %xmm0, (%rsp) ; AVX2-NEXT: vmovd %xmm3, %eax ; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rcx,4) ; AVX2-NEXT: vpextrd $1, %xmm3, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx @@ -1093,15 +1095,15 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $1, %xmm1, %r13d +; AVX2-NEXT: vpextrb $1, %xmm1, %ebp ; AVX2-NEXT: vmovd %xmm1, %esi ; AVX2-NEXT: movl %esi, %eax ; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: subb %r13b, %al +; AVX2-NEXT: subb %bpl, %al ; AVX2-NEXT: vpextrb $2, %xmm1, %edx ; AVX2-NEXT: subb %dl, %al -; AVX2-NEXT: vpextrb $3, %xmm1, %ebp -; AVX2-NEXT: subb %bpl, %al +; AVX2-NEXT: vpextrb $3, %xmm1, %r13d +; AVX2-NEXT: subb %r13b, %al ; AVX2-NEXT: vpextrb $4, %xmm1, %r12d ; AVX2-NEXT: subb %r12b, %al ; AVX2-NEXT: vpextrb $5, %xmm1, %r15d @@ -1135,17 +1137,17 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> ; AVX2-NEXT: vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: andl $1, %esi ; AVX2-NEXT: vpextrb $1, %xmm0, -40(%rsp,%rsi) -; AVX2-NEXT: andl $1, %r13d -; AVX2-NEXT: 
addq %rsi, %r13 -; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%r13) +; AVX2-NEXT: andl $1, %ebp +; AVX2-NEXT: addq %rsi, %rbp +; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%rbp) ; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %r13, %rdx +; AVX2-NEXT: addq %rbp, %rdx ; AVX2-NEXT: vpextrb $3, %xmm0, -40(%rsp,%rdx) -; AVX2-NEXT: andl $1, %ebp -; AVX2-NEXT: addq %rdx, %rbp -; AVX2-NEXT: vpextrb $4, %xmm0, -40(%rsp,%rbp) +; AVX2-NEXT: andl $1, %r13d +; AVX2-NEXT: addq %rdx, %r13 +; AVX2-NEXT: vpextrb $4, %xmm0, -40(%rsp,%r13) ; AVX2-NEXT: andl $1, %r12d -; AVX2-NEXT: addq %rbp, %r12 +; AVX2-NEXT: addq %r13, %r12 ; AVX2-NEXT: andl $1, %r15d ; AVX2-NEXT: addq %r12, %r15 ; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12 @@ -1693,30 +1695,30 @@ define <16 x i16> @test_compress_v16i16(<16 x i16> %vec, <16 x i1> %mask, <16 x ; AVX2-NEXT: vpextrw $4, %xmm1, %r13d ; AVX2-NEXT: andl $1, %r13d ; AVX2-NEXT: addq %r12, %r13 -; AVX2-NEXT: vpextrw $5, %xmm1, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %r13, %rdx -; AVX2-NEXT: vpextrw $6, %xmm1, %ecx +; AVX2-NEXT: vpextrw $5, %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: vpextrw $7, %xmm1, %edi -; AVX2-NEXT: andl $1, %edi -; AVX2-NEXT: addq %rcx, %rdi +; AVX2-NEXT: addq %r13, %rcx +; AVX2-NEXT: vpextrw $6, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: vpextrw $7, %xmm1, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rax, %rdx ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: cmpq $16, %rdi -; AVX2-NEXT: vpextrw $7, %xmm1, %eax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX2-NEXT: cmovbw (%rsp,%rsi,2), %ax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: cmpq $16, %rdx +; AVX2-NEXT: vpextrw $7, %xmm1, %esi +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX2-NEXT: cmovbw (%rsp,%rdi,2), %si +; AVX2-NEXT: movl %esi, %edi ; AVX2-NEXT: vpextrw $0, %xmm0, (%rsp) -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload ; AVX2-NEXT: vpextrw $1, %xmm0, (%rsp,%rsi,2) ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; AVX2-NEXT: vpextrw $2, %xmm0, (%rsp,%rsi,2) ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; AVX2-NEXT: vpextrw $3, %xmm0, (%rsp,%rsi,2) -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: vpextrw $4, %xmm0, (%rsp,%rax,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: vpextrw $4, %xmm0, (%rsp,%rsi,2) ; AVX2-NEXT: andl $15, %r8d ; AVX2-NEXT: vpextrw $5, %xmm0, (%rsp,%r8,2) ; AVX2-NEXT: andl $15, %r9d @@ -1735,16 +1737,15 @@ define <16 x i16> @test_compress_v16i16(<16 x i16> %vec, <16 x i1> %mask, <16 x ; AVX2-NEXT: vpextrw $4, %xmm1, (%rsp,%r12,2) ; AVX2-NEXT: andl $15, %r13d ; AVX2-NEXT: vpextrw $5, %xmm1, (%rsp,%r13,2) -; AVX2-NEXT: andl $15, %edx -; AVX2-NEXT: vpextrw $6, %xmm1, (%rsp,%rdx,2) ; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrw $7, %xmm1, (%rsp,%rcx,2) -; AVX2-NEXT: cmpq $15, %rdi +; AVX2-NEXT: vpextrw $6, %xmm1, (%rsp,%rcx,2) +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpextrw $7, %xmm1, (%rsp,%rax,2) +; AVX2-NEXT: cmpq $15, %rdx ; AVX2-NEXT: movl $15, %eax -; AVX2-NEXT: cmovbq %rdi, %rax +; AVX2-NEXT: cmovbq %rdx, %rax ; AVX2-NEXT: movl %eax, %eax -; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; AVX2-NEXT: movw %cx, (%rsp,%rax,2) +; AVX2-NEXT: movw %di, (%rsp,%rax,2) ; 
AVX2-NEXT: vmovaps (%rsp), %ymm0 ; AVX2-NEXT: leaq -40(%rbp), %rsp ; AVX2-NEXT: popq %rbx @@ -1788,135 +1789,141 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: andq $-32, %rsp -; AVX2-NEXT: subq $96, %rsp -; AVX2-NEXT: movl %r9d, %r11d -; AVX2-NEXT: movl %r8d, %r10d -; AVX2-NEXT: movl %ecx, %r9d -; AVX2-NEXT: movl %edx, %r8d +; AVX2-NEXT: subq $160, %rsp +; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 +; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: # kill: def $edi killed $edi def $rdi -; AVX2-NEXT: movzbl 360(%rbp), %eax -; AVX2-NEXT: movzbl 352(%rbp), %ecx -; AVX2-NEXT: vmovd %ecx, %xmm4 +; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl 360(%rbp), %eax +; AVX2-NEXT: movl 352(%rbp), %r10d +; AVX2-NEXT: vmovd %r10d, %xmm4 ; AVX2-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movzbl 368(%rbp), %eax +; AVX2-NEXT: movl 368(%rbp), %eax ; AVX2-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movzbl 376(%rbp), %eax +; AVX2-NEXT: movl 376(%rbp), %eax ; AVX2-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movzbl 384(%rbp), %eax +; AVX2-NEXT: movl 384(%rbp), %eax ; AVX2-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movzbl 392(%rbp), %eax +; AVX2-NEXT: movl 392(%rbp), %eax ; AVX2-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movzbl 400(%rbp), %eax +; AVX2-NEXT: movl 400(%rbp), %eax ; AVX2-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movzbl 408(%rbp), %eax +; AVX2-NEXT: movl 408(%rbp), %eax ; AVX2-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movzbl 416(%rbp), %eax +; AVX2-NEXT: movl 416(%rbp), %eax ; AVX2-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movzbl 424(%rbp), %eax +; AVX2-NEXT: movl 424(%rbp), %eax ; AVX2-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movzbl 432(%rbp), %eax +; AVX2-NEXT: movl 432(%rbp), %eax ; AVX2-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movzbl 440(%rbp), %eax +; AVX2-NEXT: movl 440(%rbp), %eax ; AVX2-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movzbl 448(%rbp), %eax +; AVX2-NEXT: movl 448(%rbp), %eax ; AVX2-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movzbl 456(%rbp), %eax +; AVX2-NEXT: movl 456(%rbp), %eax ; AVX2-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movzbl 464(%rbp), %eax +; AVX2-NEXT: movl 464(%rbp), %eax ; AVX2-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movzbl 472(%rbp), %eax +; AVX2-NEXT: movl 472(%rbp), %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 -; AVX2-NEXT: movzbl 224(%rbp), %eax +; AVX2-NEXT: movl 224(%rbp), %eax ; AVX2-NEXT: vmovd %eax, %xmm5 -; AVX2-NEXT: movzbl 232(%rbp), %eax +; AVX2-NEXT: movl 232(%rbp), %eax ; AVX2-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 240(%rbp), %eax +; AVX2-NEXT: movl 240(%rbp), %eax ; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 248(%rbp), %eax +; AVX2-NEXT: movl 248(%rbp), %eax ; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 256(%rbp), %eax +; AVX2-NEXT: movl 256(%rbp), 
%eax ; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 264(%rbp), %eax +; AVX2-NEXT: movl 264(%rbp), %eax ; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 272(%rbp), %eax +; AVX2-NEXT: movl 272(%rbp), %eax ; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 280(%rbp), %eax +; AVX2-NEXT: movl 280(%rbp), %eax ; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 288(%rbp), %eax +; AVX2-NEXT: movl 288(%rbp), %eax ; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 296(%rbp), %eax +; AVX2-NEXT: movl 296(%rbp), %eax ; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 304(%rbp), %eax +; AVX2-NEXT: movl 304(%rbp), %eax ; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 312(%rbp), %eax +; AVX2-NEXT: movl 312(%rbp), %eax ; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 320(%rbp), %eax +; AVX2-NEXT: movl 320(%rbp), %eax ; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 328(%rbp), %eax +; AVX2-NEXT: movl 328(%rbp), %eax ; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 336(%rbp), %eax +; AVX2-NEXT: movl 336(%rbp), %eax ; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 344(%rbp), %eax +; AVX2-NEXT: movl 344(%rbp), %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 ; AVX2-NEXT: vmovd %edi, %xmm5 ; AVX2-NEXT: vpinsrb $1, %esi, %xmm5, %xmm5 ; AVX2-NEXT: vpinsrb $2, %edx, %xmm5, %xmm5 -; AVX2-NEXT: vpinsrb $3, %r9d, %xmm5, %xmm5 -; AVX2-NEXT: vpinsrb $4, %r10d, %xmm5, %xmm5 -; AVX2-NEXT: vpinsrb $5, %r11d, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 16(%rbp), %ebx -; AVX2-NEXT: vpinsrb $6, %ebx, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 24(%rbp), %r14d -; AVX2-NEXT: vpinsrb $7, %r14d, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 32(%rbp), %r15d -; AVX2-NEXT: vpinsrb $8, %r15d, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 40(%rbp), %r12d -; AVX2-NEXT: vpinsrb $9, %r12d, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 48(%rbp), %r13d -; AVX2-NEXT: vpinsrb $10, %r13d, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 56(%rbp), %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 64(%rbp), %eax -; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 72(%rbp), %eax -; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 80(%rbp), %eax -; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 88(%rbp), %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 -; AVX2-NEXT: movzbl 96(%rbp), %eax -; AVX2-NEXT: vmovd %eax, %xmm6 -; AVX2-NEXT: movzbl 104(%rbp), %eax +; AVX2-NEXT: vpinsrb $3, %ecx, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $4, %r8d, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $5, %r9d, %xmm5, %xmm5 +; AVX2-NEXT: movl 16(%rbp), %esi +; AVX2-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5 +; AVX2-NEXT: movl 24(%rbp), %edi +; AVX2-NEXT: vpinsrb $7, %edi, %xmm5, %xmm5 +; AVX2-NEXT: movl 32(%rbp), %r8d +; AVX2-NEXT: vpinsrb $8, %r8d, %xmm5, %xmm5 +; AVX2-NEXT: movl 40(%rbp), %r9d +; AVX2-NEXT: vpinsrb $9, %r9d, %xmm5, %xmm5 +; AVX2-NEXT: movl 48(%rbp), %r10d +; AVX2-NEXT: vpinsrb $10, %r10d, %xmm5, %xmm5 +; AVX2-NEXT: movl 56(%rbp), %r11d +; AVX2-NEXT: vpinsrb $11, %r11d, %xmm5, %xmm5 +; AVX2-NEXT: movl 64(%rbp), %ebx +; AVX2-NEXT: vpinsrb $12, %ebx, %xmm5, %xmm5 +; AVX2-NEXT: movl 72(%rbp), %r14d +; AVX2-NEXT: vpinsrb $13, %r14d, %xmm5, %xmm5 +; AVX2-NEXT: movl 80(%rbp), %r15d +; AVX2-NEXT: vpinsrb $14, %r15d, %xmm5, %xmm5 +; AVX2-NEXT: movl 88(%rbp), %r12d +; AVX2-NEXT: vpinsrb $15, %r12d, %xmm5, %xmm5 +; AVX2-NEXT: movl 96(%rbp), %r13d +; AVX2-NEXT: 
vmovd %r13d, %xmm6 +; AVX2-NEXT: movl 104(%rbp), %eax ; AVX2-NEXT: vpinsrb $1, %eax, %xmm6, %xmm6 -; AVX2-NEXT: movzbl 112(%rbp), %eax +; AVX2-NEXT: movl 112(%rbp), %eax ; AVX2-NEXT: vpinsrb $2, %eax, %xmm6, %xmm6 -; AVX2-NEXT: movzbl 120(%rbp), %eax +; AVX2-NEXT: movl 120(%rbp), %eax ; AVX2-NEXT: vpinsrb $3, %eax, %xmm6, %xmm6 -; AVX2-NEXT: movzbl 128(%rbp), %eax +; AVX2-NEXT: movl 128(%rbp), %eax ; AVX2-NEXT: vpinsrb $4, %eax, %xmm6, %xmm6 -; AVX2-NEXT: movzbl 136(%rbp), %eax +; AVX2-NEXT: movl 136(%rbp), %eax ; AVX2-NEXT: vpinsrb $5, %eax, %xmm6, %xmm6 -; AVX2-NEXT: movzbl 144(%rbp), %eax +; AVX2-NEXT: movl 144(%rbp), %eax ; AVX2-NEXT: vpinsrb $6, %eax, %xmm6, %xmm6 -; AVX2-NEXT: movzbl 152(%rbp), %eax +; AVX2-NEXT: movl 152(%rbp), %eax ; AVX2-NEXT: vpinsrb $7, %eax, %xmm6, %xmm6 -; AVX2-NEXT: movzbl 160(%rbp), %eax +; AVX2-NEXT: movl 160(%rbp), %eax ; AVX2-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6 -; AVX2-NEXT: movzbl 168(%rbp), %eax +; AVX2-NEXT: movl 168(%rbp), %eax ; AVX2-NEXT: vpinsrb $9, %eax, %xmm6, %xmm6 -; AVX2-NEXT: movzbl 176(%rbp), %eax +; AVX2-NEXT: movl 176(%rbp), %eax ; AVX2-NEXT: vpinsrb $10, %eax, %xmm6, %xmm6 -; AVX2-NEXT: movzbl 184(%rbp), %eax +; AVX2-NEXT: movl 184(%rbp), %eax ; AVX2-NEXT: vpinsrb $11, %eax, %xmm6, %xmm6 -; AVX2-NEXT: movzbl 192(%rbp), %eax +; AVX2-NEXT: movl 192(%rbp), %eax ; AVX2-NEXT: vpinsrb $12, %eax, %xmm6, %xmm6 -; AVX2-NEXT: movzbl 200(%rbp), %eax +; AVX2-NEXT: movl 200(%rbp), %eax ; AVX2-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6 -; AVX2-NEXT: movzbl 208(%rbp), %eax +; AVX2-NEXT: movl 208(%rbp), %eax ; AVX2-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6 -; AVX2-NEXT: movzbl 216(%rbp), %eax +; AVX2-NEXT: movl 216(%rbp), %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm6, %xmm6 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] @@ -1960,435 +1967,382 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX2-NEXT: vmovaps %ymm2, (%rsp) ; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %edx +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp) -; AVX2-NEXT: andl $1, %edi -; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rdi) -; AVX2-NEXT: andl $1, %esi -; AVX2-NEXT: addq %rdi, %rsi -; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rsi) -; AVX2-NEXT: andl $1, %r8d -; AVX2-NEXT: addq %rsi, %r8 -; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r8) -; AVX2-NEXT: andl $1, %r9d -; AVX2-NEXT: addq %r8, %r9 -; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r9) -; AVX2-NEXT: andl $1, %r10d -; AVX2-NEXT: addq %r9, %r10 -; AVX2-NEXT: movl %r10d, %eax -; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) -; AVX2-NEXT: andl $1, %r11d -; AVX2-NEXT: addq %r10, %r11 -; AVX2-NEXT: movzbl %bl, %eax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %r11, %rax -; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11 -; AVX2-NEXT: andl $63, %r11d -; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%r11) -; AVX2-NEXT: movzbl %r14b, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl %r15b, %eax +; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX2-NEXT: andl 
$1, %eax ; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl %r12b, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl %r13b, %eax +; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 56(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 64(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 72(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 80(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 88(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: addq %rax, %rsi ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 96(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: addq %rsi, %rdi +; AVX2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi +; AVX2-NEXT: andl $63, %esi +; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rsi) +; AVX2-NEXT: andl $1, %r8d +; AVX2-NEXT: addq %rdi, %r8 +; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi +; AVX2-NEXT: andl $63, %edi +; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rdi) +; AVX2-NEXT: andl $1, %r9d +; AVX2-NEXT: addq %r8, %r9 +; AVX2-NEXT: # kill: def $r8d killed $r8d killed $r8 def $r8 +; AVX2-NEXT: andl $63, %r8d +; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%r8) +; AVX2-NEXT: andl $1, %r10d +; AVX2-NEXT: addq %r9, %r10 +; AVX2-NEXT: # kill: def $r9d killed $r9d killed $r9 def $r9 +; AVX2-NEXT: andl $63, %r9d +; AVX2-NEXT: vpextrb 
$10, %xmm0, (%rsp,%r9) +; AVX2-NEXT: andl $1, %r11d +; AVX2-NEXT: addq %r10, %r11 +; AVX2-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10 +; AVX2-NEXT: andl $63, %r10d +; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%r10) +; AVX2-NEXT: andl $1, %ebx +; AVX2-NEXT: addq %r11, %rbx +; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11 +; AVX2-NEXT: andl $63, %r11d +; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%r11) +; AVX2-NEXT: andl $1, %r14d +; AVX2-NEXT: addq %rbx, %r14 +; AVX2-NEXT: # kill: def $ebx killed $ebx killed $rbx def $rbx +; AVX2-NEXT: andl $63, %ebx +; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rbx) +; AVX2-NEXT: andl $1, %r15d +; AVX2-NEXT: addq %r14, %r15 +; AVX2-NEXT: # kill: def $r14d killed $r14d killed $r14 def $r14 +; AVX2-NEXT: andl $63, %r14d +; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%r14) +; AVX2-NEXT: andl $1, %r12d +; AVX2-NEXT: addq %r15, %r12 +; AVX2-NEXT: # kill: def $r15d killed $r15d killed $r15 def $r15 +; AVX2-NEXT: andl $63, %r15d +; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%r15) +; AVX2-NEXT: andl $1, %r13d +; AVX2-NEXT: addq %r12, %r13 +; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12 +; AVX2-NEXT: andl $63, %r12d ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 104(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%r12) +; AVX2-NEXT: movl 104(%rbp), %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %r13, %rax +; AVX2-NEXT: # kill: def $r13d killed $r13d killed $r13 def $r13 +; AVX2-NEXT: andl $63, %r13d +; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%r13) +; AVX2-NEXT: movl 112(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rax, %rdx ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 112(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 120(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 120(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 128(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 136(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 128(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 136(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 144(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx 
killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 152(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 144(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 152(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 160(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 168(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 160(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 168(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 176(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 184(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 176(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 184(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 192(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 200(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 192(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 200(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 208(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 216(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 208(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl 
%ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 216(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 224(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rcx) -; AVX2-NEXT: movzbl 232(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 224(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 232(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $1, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movzbl 240(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rcx) -; AVX2-NEXT: movzbl 248(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 240(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 248(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $3, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movzbl 256(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rcx) -; AVX2-NEXT: movzbl 264(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: movl 256(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 264(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $5, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movzbl 272(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rcx) -; AVX2-NEXT: movzbl 280(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 272(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 280(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: 
addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $7, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movzbl 288(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rcx) -; AVX2-NEXT: movzbl 296(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 288(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 296(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $9, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movzbl 304(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rcx) -; AVX2-NEXT: movzbl 312(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 304(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 312(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $11, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movzbl 320(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rcx) -; AVX2-NEXT: movzbl 328(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 320(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 328(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $13, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movzbl 336(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rcx) -; AVX2-NEXT: movzbl 344(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 336(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 344(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $15, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movzbl 352(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx 
killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: movl 352(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 360(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 360(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 368(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 376(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 368(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 376(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 384(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 392(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 384(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 392(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 400(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 408(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 400(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 408(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 416(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 424(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 416(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; 
AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 424(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 432(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 440(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 432(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 440(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 448(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 456(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 448(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 456(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movzbl 464(%rbp), %eax -; AVX2-NEXT: movzbl %al, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx) -; AVX2-NEXT: movzbl 472(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 464(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 472(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) ; AVX2-NEXT: vpextrb $15, %xmm0, %eax ; AVX2-NEXT: cmpq $64, %rcx -; AVX2-NEXT: cmovbl %edx, %eax +; AVX2-NEXT: cmovbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload ; AVX2-NEXT: cmpq $63, %rcx -; AVX2-NEXT: movl $63, %edx -; AVX2-NEXT: cmovbq %rcx, %rdx -; AVX2-NEXT: movb %al, (%rsp,%rdx) +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: movl $63, %ecx +; AVX2-NEXT: cmovbq %rdx, %rcx +; AVX2-NEXT: movb %al, (%rsp,%rcx) ; AVX2-NEXT: vmovaps (%rsp), %ymm0 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: leaq -40(%rbp), %rsp @@ -3323,10 +3277,10 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i ; AVX2-NEXT: movq %rsp, %rbp ; 
AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $288, %rsp # imm = 0x120 -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx ; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 ; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi ; AVX2-NEXT: movq %rdi, %rax ; AVX2-NEXT: vmovss %xmm0, (%rsp) @@ -3344,413 +3298,355 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i ; AVX2-NEXT: vmovss %xmm0, (%rsp,%r8,4) ; AVX2-NEXT: andl $1, %r9d ; AVX2-NEXT: addl %r8d, %r9d -; AVX2-NEXT: movzbl 16(%rbp), %ecx ; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%r9,4) -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 16(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %r9d, %ecx -; AVX2-NEXT: movzbl 24(%rbp), %edx ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 24(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: movzbl 32(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 32(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 40(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 40(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vmovss %xmm1, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 48(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 48(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $1, %xmm1, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 56(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 56(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm1, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 64(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 64(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $3, %xmm1, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 72(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 72(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 80(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 80(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: 
vextractps $1, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 88(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 88(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 96(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 96(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 104(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 104(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vmovss %xmm2, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 112(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 112(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $1, %xmm2, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 120(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 120(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm2, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 128(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 128(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $3, %xmm2, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 136(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 136(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm0 ; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 144(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 144(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 152(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 152(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 160(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 160(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps 
$3, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 168(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 168(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vmovss %xmm3, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 176(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 176(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $1, %xmm3, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 184(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 184(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm3, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 192(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 192(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $3, %xmm3, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 200(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 200(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm0 ; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 208(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 208(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 216(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 216(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 224(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 224(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 232(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 232(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vmovss %xmm4, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 240(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 240(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $1, %xmm4, 
(%rsp,%rdx,4) -; AVX2-NEXT: movzbl 248(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 248(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm4, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 256(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 256(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $3, %xmm4, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 264(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 264(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm0 ; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 272(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 272(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 280(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 280(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 288(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 288(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 296(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 296(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vmovss %xmm5, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 304(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 304(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $1, %xmm5, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 312(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 312(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm5, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 320(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 320(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $3, %xmm5, 
(%rsp,%rdx,4) -; AVX2-NEXT: movzbl 328(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 328(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm0 ; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 336(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 336(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 344(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 344(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 352(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 352(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 360(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 360(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vmovss %xmm6, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 368(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 368(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $1, %xmm6, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 376(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 376(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm6, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 384(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 384(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $3, %xmm6, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 392(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 392(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm0 ; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 400(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 400(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; 
AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 408(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 408(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 416(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 416(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 424(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 424(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vmovss %xmm7, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 432(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 432(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $1, %xmm7, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 440(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 440(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm7, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 448(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 448(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $3, %xmm7, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 456(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 456(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm0 ; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 464(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: movl 464(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $63, %edx ; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 472(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movl 472(%rbp), %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) ; AVX2-NEXT: andl $63, %edx @@ -4748,6 +4644,17 @@ define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) nounwind { ; AVX2-NEXT: vpextrb $3, %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: vpextrb $4, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: vpextrb $5, %xmm0, 
-24(%rsp,%rcx) +; AVX2-NEXT: vpextrb $6, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: vpextrb $7, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: vpextrb $8, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: vpextrb $9, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: vpextrb $10, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: vpextrb $11, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: vpextrb $12, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: vpextrb $13, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: vpextrb $14, %xmm0, -24(%rsp,%rcx) ; AVX2-NEXT: vpextrb $15, %xmm0, -24(%rsp,%rcx) ; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll index 81c4d5d71084c..560d5be284f15 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -12,6 +12,9 @@ define void @load_1byte_chunk_of_2byte_alloca_with_zero_upper_half(ptr %src, i64 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half: ; X64-NO-BMI2: # %bb.0: ; X64-NO-BMI2-NEXT: movzbl (%rdi), %eax +; X64-NO-BMI2-NEXT: movd %eax, %xmm0 +; X64-NO-BMI2-NEXT: movd %xmm0, %eax +; X64-NO-BMI2-NEXT: movzwl %ax, %eax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrl %cl, %eax @@ -21,6 +24,9 @@ define void @load_1byte_chunk_of_2byte_alloca_with_zero_upper_half(ptr %src, i64 ; X64-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half: ; X64-BMI2: # %bb.0: ; X64-BMI2-NEXT: movzbl (%rdi), %eax +; X64-BMI2-NEXT: movd %eax, %xmm0 +; X64-BMI2-NEXT: movd %xmm0, %eax +; X64-BMI2-NEXT: movzwl %ax, %eax ; X64-BMI2-NEXT: shll $3, %esi ; X64-BMI2-NEXT: shrxl %esi, %eax, %eax ; X64-BMI2-NEXT: movb %al, (%rdx) @@ -28,14 +34,17 @@ define void @load_1byte_chunk_of_2byte_alloca_with_zero_upper_half(ptr %src, i64 ; ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half: ; X86-NO-BMI2: # %bb.0: -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movzbl (%eax), %eax +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NEXT: movzbl (%edx), %edx +; X86-NO-BMI2-NEXT: movd %edx, %xmm0 +; X86-NO-BMI2-NEXT: movd %xmm0, %edx +; X86-NO-BMI2-NEXT: movzwl %dx, %edx ; X86-NO-BMI2-NEXT: shll $3, %ecx ; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NEXT: movb %al, (%edx) +; X86-NO-BMI2-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NEXT: movb %dl, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half: @@ -44,6 +53,9 @@ define void @load_1byte_chunk_of_2byte_alloca_with_zero_upper_half(ptr %src, i64 ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movzbl (%edx), %edx +; X86-BMI2-NEXT: movd %edx, %xmm0 +; X86-BMI2-NEXT: movd %xmm0, %edx +; X86-BMI2-NEXT: movzwl %dx, %edx ; X86-BMI2-NEXT: shll $3, %ecx ; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx ; X86-BMI2-NEXT: movb %cl, (%eax) @@ -65,6 +77,10 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X64-NO-BMI2: # %bb.0: ; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax +; X64-NO-BMI2-NEXT: movd %eax, %xmm0 +; 
X64-NO-BMI2-NEXT: pxor %xmm1, %xmm1 +; X64-NO-BMI2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NO-BMI2-NEXT: movd %xmm0, %eax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrl %cl, %eax @@ -74,6 +90,10 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X64-BMI2: # %bb.0: ; X64-BMI2-NEXT: movzwl (%rdi), %eax +; X64-BMI2-NEXT: movd %eax, %xmm0 +; X64-BMI2-NEXT: pxor %xmm1, %xmm1 +; X64-BMI2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-BMI2-NEXT: movd %xmm0, %eax ; X64-BMI2-NEXT: shll $3, %esi ; X64-BMI2-NEXT: shrxl %esi, %eax, %eax ; X64-BMI2-NEXT: movb %al, (%rdx) @@ -81,14 +101,18 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X86-NO-BMI2: # %bb.0: -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movzwl (%eax), %eax +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NEXT: movzwl (%edx), %edx +; X86-NO-BMI2-NEXT: movd %edx, %xmm0 +; X86-NO-BMI2-NEXT: pxor %xmm1, %xmm1 +; X86-NO-BMI2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NO-BMI2-NEXT: movd %xmm0, %edx ; X86-NO-BMI2-NEXT: shll $3, %ecx ; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NEXT: movb %al, (%edx) +; X86-NO-BMI2-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NEXT: movb %dl, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half: @@ -97,6 +121,10 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movzwl (%edx), %edx +; X86-BMI2-NEXT: movd %edx, %xmm0 +; X86-BMI2-NEXT: pxor %xmm1, %xmm1 +; X86-BMI2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-BMI2-NEXT: movd %xmm0, %edx ; X86-BMI2-NEXT: shll $3, %ecx ; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx ; X86-BMI2-NEXT: movb %cl, (%eax) @@ -119,6 +147,10 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X64-NO-BMI2: # %bb.0: ; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax +; X64-NO-BMI2-NEXT: movd %eax, %xmm0 +; X64-NO-BMI2-NEXT: pxor %xmm1, %xmm1 +; X64-NO-BMI2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NO-BMI2-NEXT: movd %xmm0, %eax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrl %cl, %eax @@ -128,6 +160,10 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X64-BMI2: # %bb.0: ; X64-BMI2-NEXT: movzwl (%rdi), %eax +; X64-BMI2-NEXT: movd %eax, %xmm0 +; X64-BMI2-NEXT: pxor %xmm1, %xmm1 +; X64-BMI2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-BMI2-NEXT: movd 
%xmm0, %eax ; X64-BMI2-NEXT: shll $3, %esi ; X64-BMI2-NEXT: shrxl %esi, %eax, %eax ; X64-BMI2-NEXT: movw %ax, (%rdx) @@ -139,6 +175,10 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NEXT: movzwl (%edx), %edx +; X86-NO-BMI2-NEXT: movd %edx, %xmm0 +; X86-NO-BMI2-NEXT: pxor %xmm1, %xmm1 +; X86-NO-BMI2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NO-BMI2-NEXT: movd %xmm0, %edx ; X86-NO-BMI2-NEXT: shll $3, %ecx ; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NEXT: shrl %cl, %edx @@ -151,6 +191,10 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movzwl (%edx), %edx +; X86-BMI2-NEXT: movd %edx, %xmm0 +; X86-BMI2-NEXT: pxor %xmm1, %xmm1 +; X86-BMI2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-BMI2-NEXT: movd %xmm0, %edx ; X86-BMI2-NEXT: shll $3, %ecx ; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx ; X86-BMI2-NEXT: movw %cx, (%eax) @@ -171,8 +215,9 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X64-NO-BMI2: # %bb.0: +; X64-NO-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx -; X64-NO-BMI2-NEXT: movl (%rdi), %eax +; X64-NO-BMI2-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movb %al, (%rdx) @@ -180,8 +225,9 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 ; ; X64-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X64-BMI2: # %bb.0: +; X64-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-BMI2-NEXT: shll $3, %esi -; X64-BMI2-NEXT: movl (%rdi), %eax +; X64-BMI2-NEXT: movq %xmm0, %rax ; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax ; X64-BMI2-NEXT: movb %al, (%rdx) ; X64-BMI2-NEXT: retq @@ -248,8 +294,9 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X64-NO-BMI2: # %bb.0: +; X64-NO-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx -; X64-NO-BMI2-NEXT: movl (%rdi), %eax +; X64-NO-BMI2-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movw %ax, (%rdx) @@ -257,8 +304,9 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 ; ; X64-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X64-BMI2: # %bb.0: +; X64-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-BMI2-NEXT: shll $3, %esi -; X64-BMI2-NEXT: movl (%rdi), %eax +; X64-BMI2-NEXT: movq %xmm0, %rax ; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax ; X64-BMI2-NEXT: movw %ax, (%rdx) ; X64-BMI2-NEXT: retq @@ -324,8 +372,9 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 define void 
@load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X64-NO-BMI2: # %bb.0: +; X64-NO-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx -; X64-NO-BMI2-NEXT: movl (%rdi), %eax +; X64-NO-BMI2-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movl %eax, (%rdx) @@ -333,8 +382,9 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 ; ; X64-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X64-BMI2: # %bb.0: +; X64-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-BMI2-NEXT: shll $3, %esi -; X64-BMI2-NEXT: movl (%rdi), %eax +; X64-BMI2-NEXT: movq %xmm0, %rax ; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax ; X64-BMI2-NEXT: movl %eax, (%rdx) ; X64-BMI2-NEXT: retq @@ -400,38 +450,73 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: leal (%rax,%rax), %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orl %edi, %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax -; X64-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %cl -; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: movb %sil, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil +; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movb %al, (%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; -; X64-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half: -; X64-SHLD: # %bb.0: -; X64-SHLD-NEXT: movq %rsi, %rcx -; X64-SHLD-NEXT: movq (%rdi), %rax -; X64-SHLD-NEXT: shll $3, %ecx -; X64-SHLD-NEXT: xorl %esi, %esi -; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-SHLD-NEXT: testb $64, %cl -; X64-SHLD-NEXT: cmovneq %rsi, %rax -; X64-SHLD-NEXT: movb %al, (%rdx) -; X64-SHLD-NEXT: retq +; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X64-NO-BMI2-HAVE-SHLD: # %bb.0: +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: testb $64, %cl +; X64-NO-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movb %sil, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: retq ; ; 
X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rdi), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil +; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (%rcx,%rcx), %r8d +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; +; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: testb $64, %cl +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movb %sil, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq +; ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi @@ -439,12 +524,11 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl @@ -469,12 +553,11 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-SHLD-NEXT: movl %ecx, %edx ; X86-SHLD-NEXT: shrb $3, %dl ; X86-SHLD-NEXT: andb $12, %dl @@ -495,12 +578,11 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl @@ -532,38 +614,73 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: leal (%rax,%rax), %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orl %edi, %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax -; X64-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %cl -; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: movw %si, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil +; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movw %ax, (%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; -; X64-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half: -; X64-SHLD: # %bb.0: -; X64-SHLD-NEXT: movq %rsi, %rcx -; X64-SHLD-NEXT: movq (%rdi), %rax -; X64-SHLD-NEXT: shll $3, %ecx -; X64-SHLD-NEXT: xorl %esi, %esi -; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-SHLD-NEXT: testb $64, %cl -; X64-SHLD-NEXT: cmovneq %rsi, %rax -; X64-SHLD-NEXT: movw %ax, (%rdx) -; X64-SHLD-NEXT: retq +; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X64-NO-BMI2-HAVE-SHLD: # %bb.0: +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: testb $64, %cl +; X64-NO-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movw %si, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi -; 
X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rdi), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil +; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (%rcx,%rcx), %r8d +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; +; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: testb $64, %cl +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq +; ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi @@ -571,12 +688,11 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl @@ -601,12 +717,11 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-SHLD-NEXT: movl %ecx, %edx ; X86-SHLD-NEXT: shrb $3, %dl ; X86-SHLD-NEXT: andb $12, %dl @@ -627,12 +742,11 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = 
mem[0],zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl @@ -663,38 +777,73 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: leal (%rax,%rax), %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orl %edi, %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax -; X64-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %cl -; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil +; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; -; X64-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half: -; X64-SHLD: # %bb.0: -; X64-SHLD-NEXT: movq %rsi, %rcx -; X64-SHLD-NEXT: movq (%rdi), %rax -; X64-SHLD-NEXT: shll $3, %ecx -; X64-SHLD-NEXT: xorl %esi, %esi -; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-SHLD-NEXT: testb $64, %cl -; X64-SHLD-NEXT: cmovneq %rsi, %rax -; X64-SHLD-NEXT: movl %eax, (%rdx) -; X64-SHLD-NEXT: retq +; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X64-NO-BMI2-HAVE-SHLD: # %bb.0: +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: testb $64, %cl +; X64-NO-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rdi), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil +; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (%rcx,%rcx), %r8d +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; +; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: testb $64, %cl +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq +; ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi @@ -702,12 +851,11 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl @@ -732,12 +880,11 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-SHLD-NEXT: movl %ecx, %edx ; X86-SHLD-NEXT: shrb $3, %dl ; X86-SHLD-NEXT: andb $12, %dl @@ -758,12 +905,11 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps 
%xmm1, %xmm1 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl @@ -794,38 +940,73 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax -; X64-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %cl -; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil +; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; -; X64-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half: -; X64-SHLD: # %bb.0: -; X64-SHLD-NEXT: movq %rsi, %rcx -; X64-SHLD-NEXT: movq (%rdi), %rax -; X64-SHLD-NEXT: shll $3, %ecx -; X64-SHLD-NEXT: xorl %esi, %esi -; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-SHLD-NEXT: testb $64, %cl -; X64-SHLD-NEXT: cmovneq %rsi, %rax -; X64-SHLD-NEXT: movq %rax, (%rdx) -; X64-SHLD-NEXT: retq +; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X64-NO-BMI2-HAVE-SHLD: # %bb.0: +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: testb $64, %cl +; X64-NO-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rdi), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq 
(%rax,%rax), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rdi, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; +; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: testb $64, %cl +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq +; ; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp @@ -836,12 +1017,11 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl @@ -881,12 +1061,11 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-SHLD-NEXT: movl %ecx, %edx ; X86-SHLD-NEXT: shrb $3, %dl ; X86-SHLD-NEXT: andb $12, %dl @@ -916,12 +1095,11 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, 
(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl @@ -964,13 +1142,13 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1 +; X64-NO-BMI2-NEXT: xorps %xmm0, %xmm0 ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx -; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movups (%rdi), %xmm1 +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movl %ecx, %eax ; X64-NO-BMI2-NEXT: shrb $6, %al ; X64-NO-BMI2-NEXT: movzbl %al, %eax @@ -982,13 +1160,13 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-BMI2-NEXT: xorps %xmm1, %xmm1 +; X64-BMI2-NEXT: xorps %xmm0, %xmm0 ; X64-BMI2-NEXT: shll $3, %esi -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movups (%rdi), %xmm1 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: movl %esi, %eax ; X64-BMI2-NEXT: shrb $6, %al ; X64-BMI2-NEXT: movzbl %al, %eax @@ -1003,13 +1181,13 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx @@ -1033,13 +1211,13 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 -; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-SHLD-NEXT: shll $3, %ecx -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) 
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movups (%edx), %xmm1 +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-SHLD-NEXT: movl %ecx, %edx ; X86-SHLD-NEXT: shrb $5, %dl ; X86-SHLD-NEXT: movzbl %dl, %edx @@ -1059,13 +1237,13 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx @@ -1096,13 +1274,13 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1 +; X64-NO-BMI2-NEXT: xorps %xmm0, %xmm0 ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx -; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movups (%rdi), %xmm1 +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movl %ecx, %eax ; X64-NO-BMI2-NEXT: shrb $6, %al ; X64-NO-BMI2-NEXT: movzbl %al, %eax @@ -1120,13 +1298,13 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-BMI2-NEXT: xorps %xmm1, %xmm1 +; X64-BMI2-NEXT: xorps %xmm0, %xmm0 ; X64-BMI2-NEXT: shll $3, %esi -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movups (%rdi), %xmm1 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: movl %esi, %eax ; X64-BMI2-NEXT: shrb $6, %al ; X64-BMI2-NEXT: movzbl %al, %eax @@ -1148,13 +1326,13 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; 
X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx @@ -1178,13 +1356,13 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 -; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-SHLD-NEXT: shll $3, %ecx -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movups (%edx), %xmm1 +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-SHLD-NEXT: movl %ecx, %edx ; X86-SHLD-NEXT: shrb $5, %dl ; X86-SHLD-NEXT: movzbl %dl, %edx @@ -1204,13 +1382,13 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx @@ -1240,13 +1418,13 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1 +; X64-NO-BMI2-NEXT: xorps %xmm0, %xmm0 ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx -; X64-NO-BMI2-NEXT: movaps %xmm1, 
-{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movups (%rdi), %xmm1 +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movl %ecx, %eax ; X64-NO-BMI2-NEXT: shrb $6, %al ; X64-NO-BMI2-NEXT: movzbl %al, %eax @@ -1264,13 +1442,13 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-BMI2-NEXT: xorps %xmm1, %xmm1 +; X64-BMI2-NEXT: xorps %xmm0, %xmm0 ; X64-BMI2-NEXT: shll $3, %esi -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movups (%rdi), %xmm1 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: movl %esi, %eax ; X64-BMI2-NEXT: shrb $6, %al ; X64-BMI2-NEXT: movzbl %al, %eax @@ -1292,13 +1470,13 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx @@ -1322,13 +1500,13 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 -; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-SHLD-NEXT: shll $3, %ecx -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movups (%edx), %xmm1 +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-SHLD-NEXT: movl %ecx, %edx ; X86-SHLD-NEXT: shrb $5, %dl ; X86-SHLD-NEXT: movzbl %dl, %edx @@ -1348,13 +1526,13 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx @@ -1384,13 +1562,13 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1 +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %al ; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax @@ -1407,13 +1585,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X64-SHLD: # %bb.0: -; X64-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-SHLD-NEXT: xorps %xmm1, %xmm1 +; X64-SHLD-NEXT: xorps %xmm0, %xmm0 ; X64-SHLD-NEXT: leal (,%rsi,8), %ecx -; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movups (%rdi), %xmm1 ; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-SHLD-NEXT: movl %ecx, %eax ; X64-SHLD-NEXT: shrb $6, %al ; X64-SHLD-NEXT: movzbl %al, %eax @@ -1426,13 +1604,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1 
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax @@ -1455,13 +1633,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx @@ -1500,13 +1678,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 -; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-SHLD-NEXT: shll $3, %ecx -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movups (%edx), %xmm1 +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-SHLD-NEXT: movl %ecx, %edx ; X86-SHLD-NEXT: shrb $5, %dl ; X86-SHLD-NEXT: movzbl %dl, %edx @@ -1535,13 +1713,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx @@ -1583,13 +1761,13 @@ define void 
@load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1 ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl ; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi @@ -1616,13 +1794,13 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: -; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1 +; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm1 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %cl ; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %esi @@ -1644,13 +1822,13 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax @@ -1673,15 +1851,15 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 -; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm1 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi @@ -1707,13 +1885,13 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi @@ -1773,13 +1951,13 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X86-SHLD-NEXT: subl $92, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movups (%eax), %xmm0 -; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-SHLD-NEXT: shll $3, %ecx -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movups (%eax), %xmm1 +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movl %ecx, %eax ; X86-SHLD-NEXT: shrb $5, %al ; X86-SHLD-NEXT: movzbl %al, %ebx @@ -1816,13 +1994,13 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $92, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, 
{{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx @@ -1881,17 +2059,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-NO-BMI2: # %bb.0: ; X64-NO-BMI2-NEXT: pushq %rax -; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1 -; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2 -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-NEXT: movups (%rdi), %xmm1 +; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm2 +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx ; X64-NO-BMI2-NEXT: andl $56, %ecx ; X64-NO-BMI2-NEXT: andl $56, %esi @@ -1910,17 +2088,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-BMI2: # %bb.0: ; X64-BMI2-NEXT: pushq %rax -; X64-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-BMI2-NEXT: movups 16(%rdi), %xmm1 -; X64-BMI2-NEXT: xorps %xmm2, %xmm2 -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: xorps %xmm0, %xmm0 +; X64-BMI2-NEXT: movups (%rdi), %xmm1 +; X64-BMI2-NEXT: movups 16(%rdi), %xmm2 +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax ; X64-BMI2-NEXT: andl $56, %esi @@ -1942,17 +2120,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 -; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; 
X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi @@ -1975,17 +2153,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SHLD-NEXT: movups (%ecx), %xmm0 -; X86-SHLD-NEXT: movups 16(%ecx), %xmm1 -; X86-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-SHLD-NEXT: movups (%ecx), %xmm1 +; X86-SHLD-NEXT: movups 16(%ecx), %xmm2 +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-SHLD-NEXT: leal (,%edx,8), %ecx ; X86-SHLD-NEXT: andl $60, %edx ; X86-SHLD-NEXT: movl (%esp,%edx), %ebx @@ -2004,17 +2182,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, 
{{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi @@ -2045,17 +2223,17 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-NO-BMI2: # %bb.0: ; X64-NO-BMI2-NEXT: pushq %rax -; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1 -; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2 -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-NEXT: movups (%rdi), %xmm1 +; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm2 +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx ; X64-NO-BMI2-NEXT: andl $56, %ecx ; X64-NO-BMI2-NEXT: andl $56, %esi @@ -2074,17 +2252,17 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-BMI2: # %bb.0: ; X64-BMI2-NEXT: pushq %rax -; X64-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-BMI2-NEXT: movups 16(%rdi), %xmm1 -; X64-BMI2-NEXT: xorps %xmm2, %xmm2 -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: xorps %xmm0, %xmm0 +; X64-BMI2-NEXT: movups (%rdi), %xmm1 +; X64-BMI2-NEXT: movups 16(%rdi), %xmm2 +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax ; X64-BMI2-NEXT: andl $56, %esi @@ -2106,17 +2284,17 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 -; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; 
X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi @@ -2139,17 +2317,17 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SHLD-NEXT: movups (%ecx), %xmm0 -; X86-SHLD-NEXT: movups 16(%ecx), %xmm1 -; X86-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-SHLD-NEXT: movups (%ecx), %xmm1 +; X86-SHLD-NEXT: movups 16(%ecx), %xmm2 +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-SHLD-NEXT: leal (,%edx,8), %ecx ; X86-SHLD-NEXT: andl $60, %edx ; X86-SHLD-NEXT: movl (%esp,%edx), %esi @@ -2168,17 +2346,17 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) 
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi @@ -2208,17 +2386,17 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-NO-BMI2: # %bb.0: ; X64-NO-BMI2-NEXT: pushq %rax -; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1 -; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2 -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-NEXT: movups (%rdi), %xmm1 +; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm2 +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx ; X64-NO-BMI2-NEXT: andl $56, %ecx ; X64-NO-BMI2-NEXT: andl $56, %esi @@ -2237,17 +2415,17 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-BMI2: # %bb.0: ; X64-BMI2-NEXT: pushq %rax -; X64-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-BMI2-NEXT: movups 16(%rdi), %xmm1 -; X64-BMI2-NEXT: xorps %xmm2, %xmm2 -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: xorps %xmm0, %xmm0 +; X64-BMI2-NEXT: movups (%rdi), %xmm1 +; X64-BMI2-NEXT: movups 16(%rdi), %xmm2 +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax ; X64-BMI2-NEXT: andl $56, %esi @@ -2269,17 +2447,17 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 -; 
X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi @@ -2302,17 +2480,17 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SHLD-NEXT: movups (%ecx), %xmm0 -; X86-SHLD-NEXT: movups 16(%ecx), %xmm1 -; X86-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-SHLD-NEXT: movups (%ecx), %xmm1 +; X86-SHLD-NEXT: movups 16(%ecx), %xmm2 +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-SHLD-NEXT: leal (,%edx,8), %ecx ; X86-SHLD-NEXT: andl $60, %edx ; X86-SHLD-NEXT: movl (%esp,%edx), %esi @@ -2331,17 +2509,17 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi @@ -2371,17 +2549,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-NO-BMI2-NO-SHLD: # %bb.0: ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax -; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 -; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm2 +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %rax @@ -2399,17 +2577,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-SHLD: # %bb.0: ; X64-SHLD-NEXT: pushq %rax -; X64-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-SHLD-NEXT: movups 16(%rdi), %xmm1 -; X64-SHLD-NEXT: xorps %xmm2, %xmm2 -; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-SHLD-NEXT: movups (%rdi), %xmm1 +; X64-SHLD-NEXT: movups 16(%rdi), %xmm2 +; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SHLD-NEXT: leal (,%rsi,8), %ecx ; X64-SHLD-NEXT: andl $56, %esi ; X64-SHLD-NEXT: movq -128(%rsp,%rsi), %rax @@ -2423,17 +2601,17 @@ define void 
@load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm2 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx @@ -2455,17 +2633,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: subl $140, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 -; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %esi @@ -2506,17 +2684,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 -; X86-SHLD-NEXT: movups 16(%edx), %xmm1 -; X86-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-SHLD-NEXT: movaps %xmm2, 
{{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-SHLD-NEXT: movups (%edx), %xmm1 +; X86-SHLD-NEXT: movups 16(%edx), %xmm2 +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-SHLD-NEXT: movl %ecx, %esi ; X86-SHLD-NEXT: andl $60, %esi ; X86-SHLD-NEXT: movl 8(%esp,%esi), %edi @@ -2545,17 +2723,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx @@ -2595,17 +2773,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-NO-BMI2-NO-SHLD: # %bb.0: ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax -; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 -; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm2 +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; 
X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi @@ -2634,17 +2812,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 -; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm1 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm2 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %edi @@ -2670,17 +2848,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm2 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, 
-{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx @@ -2706,17 +2884,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm1 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm2 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %ecx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax @@ -2745,17 +2923,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-NO-BMI2-NO-SHLD-NEXT: subl $156, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 -; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebx @@ -2816,17 +2994,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-SHLD-NEXT: subl $156, %esp ; 
X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movups (%eax), %xmm0 -; X86-SHLD-NEXT: movups 16(%eax), %xmm1 -; X86-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-SHLD-NEXT: movups (%eax), %xmm1 +; X86-SHLD-NEXT: movups 16(%eax), %xmm2 +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movl %ecx, %edi ; X86-SHLD-NEXT: andl $60, %edi ; X86-SHLD-NEXT: movl 24(%esp,%edi), %esi @@ -2864,17 +3042,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax @@ -2931,17 +3109,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax -; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 -; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm2 +; X64-NO-BMI2-NO-SHLD-NEXT: 
movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi @@ -2993,17 +3171,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 -; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm1 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm2 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %edi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax @@ -3046,17 +3224,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm2 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, 
-{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx @@ -3097,17 +3275,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm1 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm2 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %ecx @@ -3146,17 +3324,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-NO-BMI2-NO-SHLD-NEXT: subl $172, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 -; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebx @@ -3257,17 +3435,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-SHLD-NEXT: subl $156, %esp 
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movups (%eax), %xmm0 -; X86-SHLD-NEXT: movups 16(%eax), %xmm1 -; X86-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-SHLD-NEXT: movups (%eax), %xmm1 +; X86-SHLD-NEXT: movups 16(%eax), %xmm2 +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movl %ecx, %edi ; X86-SHLD-NEXT: andl $60, %edi ; X86-SHLD-NEXT: movl 24(%esp,%edi), %edx @@ -3324,17 +3502,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll index 8d36eef952a2b..c4c87086dc359 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -168,8 +168,8 @@ define void @load_2byte_chunk_of_4byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca: ; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movb %al, (%rdx) @@ -188,17 +188,15 @@ define void 
@load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx @@ -215,13 +213,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl @@ -236,14 +232,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi @@ -260,23 +253,19 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %bl, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %dl, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 @@ -292,8 +281,8 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca: ; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movw %ax, (%rdx) @@ -312,17 +301,15 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx @@ -339,18 +326,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 
4(%edx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: retl ; @@ -360,14 +345,11 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi @@ -386,18 +368,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 @@ -413,8 +393,8 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca: ; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movl %eax, (%rdx) @@ -433,17 +413,15 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 
{{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx @@ -460,18 +438,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: retl ; @@ -481,14 +457,11 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi @@ -507,18 +480,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 @@ -536,8 +507,8 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi @@ -557,8 +528,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -571,8 +542,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx @@ -591,8 +562,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -610,8 +581,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; 
X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) @@ -639,8 +610,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 ; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: movups (%edx), %xmm0 ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) @@ -664,8 +635,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) @@ -698,8 +669,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi @@ -719,8 +690,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -733,8 +704,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx @@ -753,8 +724,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -772,8 +743,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) @@ -801,8 +772,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 ; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: movups (%edx), %xmm0 ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) @@ -826,8 +797,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) @@ -859,8 +830,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi @@ -880,8 +851,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -894,8 +865,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx @@ -914,8 +885,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: 
movq %rsi, %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -933,8 +904,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) @@ -962,8 +933,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 ; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: movups (%edx), %xmm0 ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) @@ -987,8 +958,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) @@ -1020,8 +991,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi @@ -1041,8 +1012,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -1055,8 +1026,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 
%xmm1, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -1075,8 +1046,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
 ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -1097,8 +1068,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
@@ -1141,8 +1112,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT: movups (%edx), %xmm0
 ; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
 ; X86-SHLD-NEXT: xorps %xmm1, %xmm1
 ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT: movaps %xmm0, (%esp)
@@ -1175,8 +1146,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
@@ -1222,9 +1193,9 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
 ; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
 ; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1241,9 +1212,9 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
 ; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: shll $3, %esi
 ; X64-BMI2-NEXT: movups (%rdi), %xmm0
 ; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT: shll $3, %esi
 ; X64-BMI2-NEXT: xorps %xmm2, %xmm2
 ; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1263,9 +1234,9 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1294,9 +1265,9 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shll $3, %ecx
 ; X86-SHLD-NEXT: movups (%edx), %xmm0
 ; X86-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-SHLD-NEXT: shll $3, %ecx
 ; X86-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1321,9 +1292,9 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1357,9 +1328,9 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
 ; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
 ; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1382,9 +1353,9 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
 ; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: shll $3, %esi
 ; X64-BMI2-NEXT: movups (%rdi), %xmm0
 ; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT: shll $3, %esi
 ; X64-BMI2-NEXT: xorps %xmm2, %xmm2
 ; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1411,9 +1382,9 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1442,9 +1413,9 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shll $3, %ecx
 ; X86-SHLD-NEXT: movups (%edx), %xmm0
 ; X86-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-SHLD-NEXT: shll $3, %ecx
 ; X86-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1469,9 +1440,9 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1504,9 +1475,9 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
 ; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
 ; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1529,9 +1500,9 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
 ; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: shll $3, %esi
 ; X64-BMI2-NEXT: movups (%rdi), %xmm0
 ; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT: shll $3, %esi
 ; X64-BMI2-NEXT: xorps %xmm2, %xmm2
 ; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1558,9 +1529,9 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1589,9 +1560,9 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shll $3, %ecx
 ; X86-SHLD-NEXT: movups (%edx), %xmm0
 ; X86-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-SHLD-NEXT: shll $3, %ecx
 ; X86-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1616,9 +1587,9 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1651,9 +1622,9 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1675,9 +1646,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
 ; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
 ; X64-SHLD-NEXT: movups (%rdi), %xmm0
 ; X64-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
 ; X64-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1695,9 +1666,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
 ; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1725,9 +1696,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1771,9 +1742,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shll $3, %ecx
 ; X86-SHLD-NEXT: movups (%edx), %xmm0
 ; X86-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-SHLD-NEXT: shll $3, %ecx
 ; X86-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1807,9 +1778,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1854,9 +1825,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1888,9 +1859,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1917,9 +1888,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
 ; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1947,9 +1918,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
 ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1982,9 +1953,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -2049,9 +2020,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; X86-SHLD-NEXT: subl $92, %esp
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: shll $3, %ecx
 ; X86-SHLD-NEXT: movups (%eax), %xmm0
 ; X86-SHLD-NEXT: movups 16(%eax), %xmm1
-; X86-SHLD-NEXT: shll $3, %ecx
 ; X86-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -2093,9 +2064,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $92, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)