diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index fabcbc5f0e856..1f4ace1b3174d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -808,6 +808,24 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( } break; } + case ISD::SRL: { + // If we are only demanding sign bits then we can use the shift source + // directly. + if (std::optional MaxSA = + DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) { + SDValue Op0 = Op.getOperand(0); + unsigned ShAmt = *MaxSA; + // Must already be signbits in DemandedBits bounds, and can't demand any + // shifted in zeroes. + if (DemandedBits.countl_zero() >= ShAmt) { + unsigned NumSignBits = + DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1); + if (DemandedBits.countr_zero() >= (BitWidth - NumSignBits)) + return Op0; + } + } + break; + } case ISD::SETCC: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll index 4143c65a840d7..662de47413654 100644 --- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll +++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll @@ -1052,16 +1052,15 @@ define noundef i64 @srem64_i32max(i64 noundef %i) { ; GFX9-NEXT: s_mov_b32 s6, 0x80000001 ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v1 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, 3, v[2:3] -; GFX9-NEXT: v_mul_i32_i24_e32 v8, 3, v6 -; GFX9-NEXT: v_lshl_add_u32 v9, v6, 31, v6 -; GFX9-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-NEXT: v_lshl_add_u32 v8, v6, 31, v6 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 -; GFX9-NEXT: v_add3_u32 v7, v7, v9, v8 +; GFX9-NEXT: v_add3_u32 v7, v7, v8, v6 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] ; GFX9-NEXT: v_sub_u32_e32 v5, v5, v1 @@ -1085,10 +1084,9 @@ define noundef i64 @srem64_i32max(i64 noundef %i) { ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX942-NEXT: v_mul_i32_i24_e32 v4, 3, v2 -; GFX942-NEXT: v_lshl_add_u32 v5, v2, 31, v2 +; GFX942-NEXT: v_lshl_add_u32 v4, v2, 31, v2 ; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, 3, 0 -; GFX942-NEXT: v_add3_u32 v3, v3, v5, v4 +; GFX942-NEXT: v_add3_u32 v3, v3, v4, v2 ; GFX942-NEXT: v_mul_hi_u32 v4, v0, 3 ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v1, 3, v[4:5] @@ -1125,17 +1123,16 @@ define noundef i64 @srem64_i32max(i64 noundef %i) { ; GFX1030-NEXT: v_mul_hi_u32 v2, v0, 3 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GFX1030-NEXT: v_mul_i32_i24_e32 v7, 3, v6 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, 3, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v8, v5 +; GFX1030-NEXT: v_mov_b32_e32 v7, v5 ; GFX1030-NEXT: v_mov_b32_e32 v5, v3 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v6, 3, 0 ; GFX1030-NEXT: v_lshl_add_u32 v6, v6, 31, v6 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x80000001, v0, v[4:5] -; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v7 +; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v2 ; GFX1030-NEXT: v_mov_b32_e32 v4, v5 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, -1, v[2:3] -; GFX1030-NEXT: v_add_co_u32 v4, s4, v8, v4 +; GFX1030-NEXT: v_add_co_u32 v4, s4, v7, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v5, null, 0, 0, s4 ; GFX1030-NEXT: v_sub_nc_u32_e32 v6, v3, v1 ; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x80000001, v1, v[4:5] @@ -1167,16 +1164,15 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) { ; GFX9-NEXT: s_mov_b32 s6, 0x80000001 ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v1 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, 3, v[2:3] -; GFX9-NEXT: v_mul_i32_i24_e32 v8, 3, v6 -; GFX9-NEXT: v_lshl_add_u32 v9, v6, 31, v6 -; GFX9-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-NEXT: v_lshl_add_u32 v8, v6, 31, v6 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 -; GFX9-NEXT: v_add3_u32 v7, v7, v9, v8 +; GFX9-NEXT: v_add3_u32 v7, v7, v8, v6 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] ; GFX9-NEXT: v_sub_u32_e32 v5, v5, v1 @@ -1195,10 +1191,9 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) { ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX942-NEXT: v_mul_i32_i24_e32 v4, 3, v2 -; GFX942-NEXT: v_lshl_add_u32 v5, v2, 31, v2 +; GFX942-NEXT: v_lshl_add_u32 v4, v2, 31, v2 ; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, 3, 0 -; GFX942-NEXT: v_add3_u32 v3, v3, v5, v4 +; GFX942-NEXT: v_add3_u32 v3, v3, v4, v2 ; GFX942-NEXT: v_mul_hi_u32 v4, v0, 3 ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v1, 3, v[4:5] @@ -1227,17 +1222,16 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) { ; GFX1030-NEXT: v_mul_hi_u32 v2, v0, 3 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GFX1030-NEXT: v_mul_i32_i24_e32 v7, 3, v6 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, 3, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v8, v5 +; GFX1030-NEXT: v_mov_b32_e32 v7, v5 ; GFX1030-NEXT: v_mov_b32_e32 v5, v3 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v6, 3, 0 ; GFX1030-NEXT: v_lshl_add_u32 v6, v6, 31, v6 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x80000001, v0, v[4:5] -; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v7 +; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v2 ; GFX1030-NEXT: v_mov_b32_e32 v4, v5 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, -1, v[2:3] -; GFX1030-NEXT: v_add_co_u32 v4, s4, v8, v4 +; GFX1030-NEXT: v_add_co_u32 v4, s4, v7, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v5, null, 0, 0, s4 ; GFX1030-NEXT: v_sub_nc_u32_e32 v6, v3, v1 ; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x80000001, v1, v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 786fe03164690..68ebc21e2ba4d 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -37,12 +37,11 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 -; SDAG-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: s_mov_b64 s[4:5], 0x432 ; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -62,34 +61,33 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v11, 0 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v2, 0 +; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 -; SDAG-NEXT: v_mul_lo_u32 v6, v11, v6 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v11, v[2:3] -; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12 -; SDAG-NEXT: v_add3_u32 v5, v5, v6, v13 +; SDAG-NEXT: v_mul_lo_u32 v6, v10, v6 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v10, v[2:3] +; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7 +; SDAG-NEXT: v_add3_u32 v5, v5, v6, v12 ; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v3 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[1:2] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v12, v[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[4:5] ; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v7 +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11 ; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] -; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v4, v10, v4, v9 +; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: .LBB0_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB0_6 @@ -102,9 +100,9 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v11, 0 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v11, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v10, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v7, v4 ; SDAG-NEXT: v_mov_b32_e32 v4, v2 ; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[3:4] @@ -112,7 +110,7 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v6, v[3:4] ; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 ; SDAG-NEXT: .LBB0_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] @@ -409,12 +407,11 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 -; SDAG-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: s_mov_b64 s[4:5], 0x432 ; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -434,34 +431,33 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v11, 0 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v2, 0 +; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 -; SDAG-NEXT: v_mul_lo_u32 v6, v11, v6 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v11, v[2:3] -; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12 -; SDAG-NEXT: v_add3_u32 v5, v5, v6, v13 +; SDAG-NEXT: v_mul_lo_u32 v6, v10, v6 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v10, v[2:3] +; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7 +; SDAG-NEXT: v_add3_u32 v5, v5, v6, v12 ; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v3 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[1:2] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v12, v[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[4:5] ; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v7 +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11 ; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] -; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v4, v10, v4, v9 +; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: .LBB1_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB1_6 @@ -474,9 +470,9 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v11, 0 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v11, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v10, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v7, v4 ; SDAG-NEXT: v_mov_b32_e32 v4, v2 ; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[3:4] @@ -484,7 +480,7 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v6, v[3:4] ; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 ; SDAG-NEXT: .LBB1_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] @@ -780,7 +776,6 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 -; SDAG-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] @@ -806,24 +801,24 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, 0 -; SDAG-NEXT: v_mul_lo_u32 v14, v8, v2 -; SDAG-NEXT: v_mul_lo_u32 v15, v10, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 ; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, v[6:7] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v6, v5 ; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, v[4:5] -; SDAG-NEXT: v_add3_u32 v3, v3, v15, v14 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v13, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5] +; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] ; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v5 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v9, v12 -; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v8, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11 +; SDAG-NEXT: v_mul_lo_u32 v7, v9, v12 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6] ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: ; implicit-def: $vgpr9 @@ -1138,7 +1133,6 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 -; SDAG-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] @@ -1164,24 +1158,24 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, 0 -; SDAG-NEXT: v_mul_lo_u32 v14, v8, v2 -; SDAG-NEXT: v_mul_lo_u32 v15, v10, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 ; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, v[6:7] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v6, v5 ; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, v[4:5] -; SDAG-NEXT: v_add3_u32 v3, v3, v15, v14 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v13, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5] +; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] ; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v5 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v9, v12 -; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v8, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11 +; SDAG-NEXT: v_mul_lo_u32 v7, v9, v12 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6] ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: ; implicit-def: $vgpr9 @@ -1551,26 +1545,25 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; SDAG-NEXT: v_mul_lo_u32 v13, v9, v2 +; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3 ; SDAG-NEXT: v_mov_b32_e32 v6, v1 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7] -; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3 ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0 -; SDAG-NEXT: v_add_co_u32_e64 v6, s[4:5], -1, v10 -; SDAG-NEXT: v_mov_b32_e32 v10, v5 +; SDAG-NEXT: v_mov_b32_e32 v8, v5 ; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_addc_co_u32_e64 v8, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SDAG-NEXT: v_add_co_u32_e64 v6, s[4:5], -1, v10 ; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 ; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v10, v5 +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 ; SDAG-NEXT: v_mul_lo_u32 v3, v6, v11 +; SDAG-NEXT: v_mul_lo_u32 v7, v6, v12 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v7, v8, v12 ; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6] -; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 +; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 @@ -1903,26 +1896,25 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; SDAG-NEXT: v_mul_lo_u32 v13, v9, v2 +; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3 ; SDAG-NEXT: v_mov_b32_e32 v6, v1 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7] -; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3 ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0 -; SDAG-NEXT: v_add_co_u32_e64 v6, s[4:5], -1, v10 -; SDAG-NEXT: v_mov_b32_e32 v10, v5 +; SDAG-NEXT: v_mov_b32_e32 v8, v5 ; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_addc_co_u32_e64 v8, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SDAG-NEXT: v_add_co_u32_e64 v6, s[4:5], -1, v10 ; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 ; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v10, v5 +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 ; SDAG-NEXT: v_mul_lo_u32 v3, v6, v11 +; SDAG-NEXT: v_mul_lo_u32 v7, v6, v12 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v7, v8, v12 ; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6] -; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 +; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 diff --git a/llvm/test/CodeGen/RISCV/pr95284.ll b/llvm/test/CodeGen/RISCV/pr95284.ll index 135e128c00bac..82600d8d3df51 100644 --- a/llvm/test/CodeGen/RISCV/pr95284.ll +++ b/llvm/test/CodeGen/RISCV/pr95284.ll @@ -6,19 +6,17 @@ define signext i64 @PR95284(i32 signext %0) { ; RV32I-LABEL: PR95284: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: seqz a1, a0 -; RV32I-NEXT: neg a2, a1 -; RV32I-NEXT: addi a0, a0, -1 -; RV32I-NEXT: srli a2, a2, 1 -; RV32I-NEXT: srli a0, a0, 1 -; RV32I-NEXT: slli a1, a1, 31 -; RV32I-NEXT: or a0, a1, a0 -; RV32I-NEXT: addi a0, a0, 1 -; RV32I-NEXT: seqz a1, a0 -; RV32I-NEXT: add a1, a2, a1 -; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: addi a1, a0, -1 +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: slli a2, a0, 31 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: addi a1, a1, 1 +; RV32I-NEXT: seqz a2, a1 +; RV32I-NEXT: sub a2, a2, a0 +; RV32I-NEXT: andi a0, a1, -2 +; RV32I-NEXT: slli a1, a2, 1 ; RV32I-NEXT: srli a1, a1, 1 -; RV32I-NEXT: andi a0, a0, -2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: PR95284: diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll index 0ee067b673da9..b887036372f7b 100644 --- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -329,6 +329,7 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 ; RV32-NEXT: lbu a0, 4(a0) ; RV32-NEXT: lw a1, 0(s0) @@ -351,6 +352,7 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32-NEXT: addi a0, a0, -1638 ; RV32-NEXT: andi a0, a0, 2047 ; RV32-NEXT: sltiu s1, a0, 2 +; RV32-NEXT: xori s4, s1, 1 ; RV32-NEXT: li a1, 1463 ; RV32-NEXT: mv a0, s2 ; RV32-NEXT: call __mulsi3 @@ -358,23 +360,22 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32-NEXT: andi a0, a0, 2047 ; RV32-NEXT: sltiu a0, a0, 293 ; RV32-NEXT: addi s3, s3, -1 -; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: addi s1, s1, -1 -; RV32-NEXT: slli a1, s1, 21 -; RV32-NEXT: srli a1, a1, 31 -; RV32-NEXT: andi a2, s3, 2047 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: andi a1, s3, 2047 ; RV32-NEXT: andi a0, a0, 2047 ; RV32-NEXT: slli a0, a0, 11 ; RV32-NEXT: slli s1, s1, 22 ; RV32-NEXT: or a0, a0, s1 -; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: sw a0, 0(s0) -; RV32-NEXT: sb a1, 4(s0) +; RV32-NEXT: sb s4, 4(s0) ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s2, 16(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; @@ -457,16 +458,15 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32M-NEXT: addi a1, a1, -1638 ; RV32M-NEXT: andi a1, a1, 2047 ; RV32M-NEXT: sltiu a1, a1, 2 -; RV32M-NEXT: li a4, 1463 -; RV32M-NEXT: mul a3, a3, a4 +; RV32M-NEXT: xori a4, a1, 1 +; RV32M-NEXT: li a5, 1463 +; RV32M-NEXT: mul a3, a3, a5 ; RV32M-NEXT: addi a3, a3, -1463 ; RV32M-NEXT: andi a3, a3, 2047 ; RV32M-NEXT: sltiu a3, a3, 293 ; RV32M-NEXT: addi a2, a2, -1 -; RV32M-NEXT: addi a3, a3, -1 ; RV32M-NEXT: addi a1, a1, -1 -; RV32M-NEXT: slli a4, a1, 21 -; RV32M-NEXT: srli a4, a4, 31 +; RV32M-NEXT: addi a3, a3, -1 ; RV32M-NEXT: andi a2, a2, 2047 ; RV32M-NEXT: andi a3, a3, 2047 ; RV32M-NEXT: slli a3, a3, 11 diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll index 5ae5caf3e88b2..537e05310dbea 100644 --- a/llvm/test/CodeGen/X86/scmp.ll +++ b/llvm/test/CodeGen/X86/scmp.ll @@ -1763,7 +1763,7 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE2-NEXT: pushq %r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movq %rdi, %r14 +; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp @@ -1779,11 +1779,11 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE2-NEXT: setl %sil ; SSE2-NEXT: setg %dil ; SSE2-NEXT: subb %sil, %dil -; SSE2-NEXT: movsbq %dil, %rax -; SSE2-NEXT: movq %rax, (%r14) -; SSE2-NEXT: movq %rax, %rsi -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movsbq %dil, %rdi +; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movq %rdi, (%rax) +; SSE2-NEXT: sarq $63, %rdi +; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: addb %r11b, %r11b ; SSE2-NEXT: sarb %r11b ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %esi @@ -1793,9 +1793,9 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE2-NEXT: setl %sil ; SSE2-NEXT: setg %r11b ; SSE2-NEXT: subb %sil, %r11b -; SSE2-NEXT: movsbq %r11b, %rdi -; SSE2-NEXT: movq %rdi, %r11 -; SSE2-NEXT: sarq $63, %r11 +; SSE2-NEXT: movsbq %r11b, %r11 +; SSE2-NEXT: movq %r11, %r14 +; SSE2-NEXT: sarq $63, %r14 ; SSE2-NEXT: addb %r12b, %r12b ; SSE2-NEXT: sarb %r12b ; SSE2-NEXT: addb %dl, %dl @@ -1804,18 +1804,18 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE2-NEXT: setl %dl ; SSE2-NEXT: setg %sil ; SSE2-NEXT: subb %dl, %sil -; SSE2-NEXT: movsbq %sil, %rdx -; SSE2-NEXT: movq %rdx, %r13 -; SSE2-NEXT: sarq $63, %r13 +; SSE2-NEXT: movsbq %sil, %r13 +; SSE2-NEXT: movq %r13, %rdi +; SSE2-NEXT: sarq $63, %rdi ; SSE2-NEXT: addb %r15b, %r15b ; SSE2-NEXT: sarb %r15b ; SSE2-NEXT: addb %cl, %cl ; SSE2-NEXT: sarb %cl ; SSE2-NEXT: cmpb %r15b, %cl ; SSE2-NEXT: setl %cl -; SSE2-NEXT: setg %sil -; SSE2-NEXT: subb %cl, %sil -; SSE2-NEXT: movsbq %sil, %r15 +; SSE2-NEXT: setg %dl +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: movsbq %dl, %r15 ; SSE2-NEXT: movq %r15, %rcx ; SSE2-NEXT: sarq $63, %rcx ; SSE2-NEXT: addb %bpl, %bpl @@ -1823,9 +1823,9 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE2-NEXT: addb %r8b, %r8b ; SSE2-NEXT: sarb %r8b ; SSE2-NEXT: cmpb %bpl, %r8b -; SSE2-NEXT: setl %sil +; SSE2-NEXT: setl %dl ; SSE2-NEXT: setg %r8b -; SSE2-NEXT: subb %sil, %r8b +; SSE2-NEXT: subb %dl, %r8b ; SSE2-NEXT: movsbq %r8b, %r8 ; SSE2-NEXT: movq %r8, %r12 ; SSE2-NEXT: sarq $63, %r12 @@ -1834,85 +1834,83 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE2-NEXT: addb %r9b, %r9b ; SSE2-NEXT: sarb %r9b ; SSE2-NEXT: cmpb %bl, %r9b -; SSE2-NEXT: setl %sil +; SSE2-NEXT: setl %dl ; SSE2-NEXT: setg %r9b -; SSE2-NEXT: subb %sil, %r9b -; SSE2-NEXT: movsbq %r9b, %r9 -; SSE2-NEXT: movq %r9, %rbx -; SSE2-NEXT: sarq $63, %rbx +; SSE2-NEXT: subb %dl, %r9b +; SSE2-NEXT: movsbq %r9b, %rsi +; SSE2-NEXT: movq %rsi, %r9 +; SSE2-NEXT: sarq $63, %r9 ; SSE2-NEXT: addb %r10b, %r10b ; SSE2-NEXT: sarb %r10b -; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: addb %sil, %sil -; SSE2-NEXT: sarb %sil -; SSE2-NEXT: cmpb %r10b, %sil -; SSE2-NEXT: setl %sil +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: addb %dl, %dl +; SSE2-NEXT: sarb %dl +; SSE2-NEXT: cmpb %r10b, %dl +; SSE2-NEXT: setl %dl ; SSE2-NEXT: setg %r10b -; SSE2-NEXT: subb %sil, %r10b -; SSE2-NEXT: movsbq %r10b, %rbp -; SSE2-NEXT: movq %rbp, %r10 -; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: movq %r10, %rsi -; SSE2-NEXT: shldq $62, %rbp, %rsi -; SSE2-NEXT: movq %rax, %xmm0 -; SSE2-NEXT: movq %rsi, 88(%r14) -; SSE2-NEXT: shrq $2, %r10 -; SSE2-NEXT: movl %r10d, 96(%r14) -; SSE2-NEXT: movq %rbx, %rsi -; SSE2-NEXT: shldq $20, %r9, %rsi -; SSE2-NEXT: movq %rsi, 64(%r14) -; SSE2-NEXT: movq %r12, %rsi -; SSE2-NEXT: shldq $31, %r8, %rsi -; SSE2-NEXT: movq %rsi, 48(%r14) -; SSE2-NEXT: movq %rcx, %rsi -; SSE2-NEXT: shldq $42, %r15, %rsi -; SSE2-NEXT: movabsq $9007199254738944, %rax # imm = 0x1FFFFFFFFFF800 -; SSE2-NEXT: andq %r13, %rax -; SSE2-NEXT: shldq $53, %rdx, %r13 -; SSE2-NEXT: movq %rsi, 32(%r14) -; SSE2-NEXT: movq %r13, 16(%r14) -; SSE2-NEXT: movabsq $9007199254740991, %rsi # imm = 0x1FFFFFFFFFFFFF -; SSE2-NEXT: andq %rsi, %r11 -; SSE2-NEXT: shldq $9, %rdi, %r11 -; SSE2-NEXT: shlq $62, %rbp -; SSE2-NEXT: orq %r11, %rbp -; SSE2-NEXT: movq %rbp, 80(%r14) -; SSE2-NEXT: movabsq $2251799813685247, %r11 # imm = 0x7FFFFFFFFFFFF -; SSE2-NEXT: andq %r10, %r11 -; SSE2-NEXT: movq %r11, %r10 -; SSE2-NEXT: shrq $48, %r10 -; SSE2-NEXT: movb %r10b, 102(%r14) -; SSE2-NEXT: shrq $32, %r11 -; SSE2-NEXT: movw %r11w, 100(%r14) +; SSE2-NEXT: subb %dl, %r10b +; SSE2-NEXT: movsbq %r10b, %r10 +; SSE2-NEXT: movq %r10, %rdx +; SSE2-NEXT: sarq $63, %rdx +; SSE2-NEXT: movl %edx, 96(%rax) +; SSE2-NEXT: movabsq $2251799813685247, %rbp # imm = 0x7FFFFFFFFFFFF +; SSE2-NEXT: andq %rdx, %rbp +; SSE2-NEXT: shldq $62, %r10, %rdx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero +; SSE2-NEXT: movq %r9, %rbx +; SSE2-NEXT: shldq $20, %rsi, %rbx +; SSE2-NEXT: movq %rdx, 88(%rax) +; SSE2-NEXT: movq %r12, %rdx +; SSE2-NEXT: shldq $31, %r8, %rdx +; SSE2-NEXT: movq %rbx, 64(%rax) +; SSE2-NEXT: movq %rcx, %rbx +; SSE2-NEXT: shldq $42, %r15, %rbx +; SSE2-NEXT: movq %rdx, 48(%rax) +; SSE2-NEXT: movq %rbx, 32(%rax) +; SSE2-NEXT: movabsq $9007199254738944, %rbx # imm = 0x1FFFFFFFFFF800 +; SSE2-NEXT: andq %rdi, %rbx +; SSE2-NEXT: shldq $53, %r13, %rdi +; SSE2-NEXT: movq %rdi, 16(%rax) +; SSE2-NEXT: movq %rbp, %rdx +; SSE2-NEXT: shrq $48, %rdx +; SSE2-NEXT: movb %dl, 102(%rax) +; SSE2-NEXT: shrq $32, %rbp +; SSE2-NEXT: movabsq $9007199254740991, %rdx # imm = 0x1FFFFFFFFFFFFF +; SSE2-NEXT: andq %rdx, %r14 +; SSE2-NEXT: shldq $9, %r11, %r14 +; SSE2-NEXT: movw %bp, 100(%rax) +; SSE2-NEXT: shlq $62, %r10 +; SSE2-NEXT: orq %r14, %r10 +; SSE2-NEXT: movq %r10, 80(%rax) ; SSE2-NEXT: shlq $42, %r15 -; SSE2-NEXT: shrq $11, %rax -; SSE2-NEXT: orq %r15, %rax -; SSE2-NEXT: movq %rax, 24(%r14) -; SSE2-NEXT: shlq $9, %rdi -; SSE2-NEXT: shrq $44, %rbx -; SSE2-NEXT: andl $511, %ebx # imm = 0x1FF -; SSE2-NEXT: orq %rdi, %rbx -; SSE2-NEXT: movq %rbx, 72(%r14) -; SSE2-NEXT: shlq $20, %r9 +; SSE2-NEXT: shrq $11, %rbx +; SSE2-NEXT: orq %r15, %rbx +; SSE2-NEXT: movq %rbx, 24(%rax) +; SSE2-NEXT: shlq $9, %r11 +; SSE2-NEXT: shrq $44, %r9 +; SSE2-NEXT: andl $511, %r9d # imm = 0x1FF +; SSE2-NEXT: orq %r11, %r9 +; SSE2-NEXT: movq %r9, 72(%rax) +; SSE2-NEXT: shlq $20, %rsi ; SSE2-NEXT: shrq $33, %r12 ; SSE2-NEXT: andl $1048575, %r12d # imm = 0xFFFFF -; SSE2-NEXT: orq %r9, %r12 -; SSE2-NEXT: movq %r12, 56(%r14) +; SSE2-NEXT: orq %rsi, %r12 +; SSE2-NEXT: movq %r12, 56(%rax) ; SSE2-NEXT: shlq $31, %r8 ; SSE2-NEXT: shrq $22, %rcx ; SSE2-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF ; SSE2-NEXT: orq %r8, %rcx -; SSE2-NEXT: movq %rcx, 40(%r14) +; SSE2-NEXT: movq %rcx, 40(%rax) ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload ; SSE2-NEXT: # xmm1 = mem[0],zero ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: andq %rsi, %rax -; SSE2-NEXT: shlq $53, %rdx -; SSE2-NEXT: orq %rax, %rdx -; SSE2-NEXT: movq %rdx, 8(%r14) -; SSE2-NEXT: movq %r14, %rax +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: andq %rdx, %rcx +; SSE2-NEXT: shlq $53, %r13 +; SSE2-NEXT: orq %rcx, %r13 +; SSE2-NEXT: movq %r13, 8(%rax) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -1929,151 +1927,148 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE4-NEXT: pushq %r13 ; SSE4-NEXT: pushq %r12 ; SSE4-NEXT: pushq %rbx -; SSE4-NEXT: movq %rdi, %r14 +; SSE4-NEXT: movq %rdi, %rbx +; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; SSE4-NEXT: addb %dil, %dil -; SSE4-NEXT: sarb %dil +; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; SSE4-NEXT: addb %r14b, %r14b +; SSE4-NEXT: sarb %r14b ; SSE4-NEXT: addb %sil, %sil ; SSE4-NEXT: sarb %sil -; SSE4-NEXT: cmpb %dil, %sil +; SSE4-NEXT: cmpb %r14b, %sil ; SSE4-NEXT: setl %sil -; SSE4-NEXT: setg %dil -; SSE4-NEXT: subb %sil, %dil -; SSE4-NEXT: movsbq %dil, %r12 -; SSE4-NEXT: movq %r12, %rdi -; SSE4-NEXT: sarq $63, %rdi -; SSE4-NEXT: addb %r10b, %r10b -; SSE4-NEXT: sarb %r10b +; SSE4-NEXT: setg %r14b +; SSE4-NEXT: subb %sil, %r14b +; SSE4-NEXT: movsbq %r14b, %r14 +; SSE4-NEXT: movq %r14, (%rbx) +; SSE4-NEXT: sarq $63, %r14 +; SSE4-NEXT: addb %r15b, %r15b +; SSE4-NEXT: sarb %r15b ; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %esi ; SSE4-NEXT: addb %sil, %sil ; SSE4-NEXT: sarb %sil -; SSE4-NEXT: cmpb %r10b, %sil +; SSE4-NEXT: cmpb %r15b, %sil ; SSE4-NEXT: setl %sil -; SSE4-NEXT: setg %r10b -; SSE4-NEXT: subb %sil, %r10b -; SSE4-NEXT: movsbq %r10b, %r10 -; SSE4-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE4-NEXT: sarq $63, %r10 -; SSE4-NEXT: addb %r11b, %r11b -; SSE4-NEXT: sarb %r11b +; SSE4-NEXT: setg %r15b +; SSE4-NEXT: subb %sil, %r15b +; SSE4-NEXT: movsbq %r15b, %r15 +; SSE4-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE4-NEXT: sarq $63, %r15 +; SSE4-NEXT: addb %bpl, %bpl +; SSE4-NEXT: sarb %bpl ; SSE4-NEXT: addb %dl, %dl ; SSE4-NEXT: sarb %dl -; SSE4-NEXT: cmpb %r11b, %dl +; SSE4-NEXT: cmpb %bpl, %dl ; SSE4-NEXT: setl %dl -; SSE4-NEXT: setg %r11b -; SSE4-NEXT: subb %dl, %r11b -; SSE4-NEXT: movsbq %r11b, %r11 -; SSE4-NEXT: movq %r11, %rsi -; SSE4-NEXT: sarq $63, %rsi -; SSE4-NEXT: addb %bl, %bl -; SSE4-NEXT: sarb %bl +; SSE4-NEXT: setg %bpl +; SSE4-NEXT: subb %dl, %bpl +; SSE4-NEXT: movsbq %bpl, %r12 +; SSE4-NEXT: movq %r12, %r13 +; SSE4-NEXT: sarq $63, %r13 +; SSE4-NEXT: addb %al, %al +; SSE4-NEXT: sarb %al ; SSE4-NEXT: addb %cl, %cl ; SSE4-NEXT: sarb %cl -; SSE4-NEXT: cmpb %bl, %cl +; SSE4-NEXT: cmpb %al, %cl ; SSE4-NEXT: setl %cl ; SSE4-NEXT: setg %dl ; SSE4-NEXT: subb %cl, %dl -; SSE4-NEXT: movsbq %dl, %rbx -; SSE4-NEXT: movq %rbx, %rcx +; SSE4-NEXT: movsbq %dl, %rsi +; SSE4-NEXT: movq %rsi, %rcx ; SSE4-NEXT: sarq $63, %rcx -; SSE4-NEXT: addb %r13b, %r13b -; SSE4-NEXT: sarb %r13b +; SSE4-NEXT: addb %r11b, %r11b +; SSE4-NEXT: sarb %r11b ; SSE4-NEXT: addb %r8b, %r8b ; SSE4-NEXT: sarb %r8b -; SSE4-NEXT: cmpb %r13b, %r8b +; SSE4-NEXT: cmpb %r11b, %r8b ; SSE4-NEXT: setl %dl ; SSE4-NEXT: setg %r8b ; SSE4-NEXT: subb %dl, %r8b ; SSE4-NEXT: movsbq %r8b, %rdx ; SSE4-NEXT: movq %rdx, %r8 ; SSE4-NEXT: sarq $63, %r8 -; SSE4-NEXT: addb %r15b, %r15b -; SSE4-NEXT: sarb %r15b +; SSE4-NEXT: addb %r10b, %r10b +; SSE4-NEXT: sarb %r10b ; SSE4-NEXT: addb %r9b, %r9b ; SSE4-NEXT: sarb %r9b -; SSE4-NEXT: cmpb %r15b, %r9b +; SSE4-NEXT: cmpb %r10b, %r9b ; SSE4-NEXT: setl %r9b -; SSE4-NEXT: setg %r15b -; SSE4-NEXT: subb %r9b, %r15b -; SSE4-NEXT: movsbq %r15b, %r9 -; SSE4-NEXT: movq %r9, %r15 -; SSE4-NEXT: sarq $63, %r15 -; SSE4-NEXT: addb %bpl, %bpl -; SSE4-NEXT: sarb %bpl -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; SSE4-NEXT: addb %r13b, %r13b -; SSE4-NEXT: sarb %r13b -; SSE4-NEXT: cmpb %bpl, %r13b -; SSE4-NEXT: setl %bpl -; SSE4-NEXT: setg %r13b -; SSE4-NEXT: subb %bpl, %r13b -; SSE4-NEXT: movsbq %r13b, %rbp +; SSE4-NEXT: setg %r10b +; SSE4-NEXT: subb %r9b, %r10b +; SSE4-NEXT: movsbq %r10b, %r9 +; SSE4-NEXT: movq %r9, %r10 +; SSE4-NEXT: sarq $63, %r10 +; SSE4-NEXT: addb %dil, %dil +; SSE4-NEXT: sarb %dil +; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; SSE4-NEXT: addb %r11b, %r11b +; SSE4-NEXT: sarb %r11b +; SSE4-NEXT: cmpb %dil, %r11b +; SSE4-NEXT: setl %dil +; SSE4-NEXT: setg %r11b +; SSE4-NEXT: subb %dil, %r11b +; SSE4-NEXT: movsbq %r11b, %rdi +; SSE4-NEXT: movq %rdi, %rbp +; SSE4-NEXT: sarq $63, %rbp +; SSE4-NEXT: movl %ebp, 96(%rbx) ; SSE4-NEXT: movq %rbp, %rax -; SSE4-NEXT: sarq $63, %rax -; SSE4-NEXT: movq %rax, %r13 -; SSE4-NEXT: shldq $62, %rbp, %r13 -; SSE4-NEXT: movq %r12, (%r14) -; SSE4-NEXT: movq %r13, 88(%r14) -; SSE4-NEXT: shrq $2, %rax -; SSE4-NEXT: movl %eax, 96(%r14) -; SSE4-NEXT: movq %r15, %r12 -; SSE4-NEXT: shldq $20, %r9, %r12 -; SSE4-NEXT: movq %r12, 64(%r14) -; SSE4-NEXT: movq %r8, %r12 -; SSE4-NEXT: shldq $31, %rdx, %r12 -; SSE4-NEXT: movq %r12, 48(%r14) -; SSE4-NEXT: movq %rcx, %r12 -; SSE4-NEXT: shldq $42, %rbx, %r12 -; SSE4-NEXT: movabsq $9007199254738944, %r13 # imm = 0x1FFFFFFFFFF800 -; SSE4-NEXT: andq %rsi, %r13 -; SSE4-NEXT: shldq $53, %r11, %rsi -; SSE4-NEXT: movq %r12, 32(%r14) -; SSE4-NEXT: movq %rsi, 16(%r14) -; SSE4-NEXT: movabsq $9007199254740991, %rsi # imm = 0x1FFFFFFFFFFFFF -; SSE4-NEXT: andq %rsi, %r10 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE4-NEXT: shldq $9, %r12, %r10 -; SSE4-NEXT: shlq $62, %rbp -; SSE4-NEXT: orq %r10, %rbp -; SSE4-NEXT: movq %rbp, 80(%r14) -; SSE4-NEXT: andq %rsi, %rdi -; SSE4-NEXT: shlq $53, %r11 -; SSE4-NEXT: orq %rdi, %r11 -; SSE4-NEXT: movq %r11, 8(%r14) -; SSE4-NEXT: movabsq $2251799813685247, %rsi # imm = 0x7FFFFFFFFFFFF -; SSE4-NEXT: andq %rax, %rsi -; SSE4-NEXT: movq %rsi, %rax -; SSE4-NEXT: shrq $48, %rax -; SSE4-NEXT: movb %al, 102(%r14) -; SSE4-NEXT: shrq $32, %rsi -; SSE4-NEXT: movw %si, 100(%r14) -; SSE4-NEXT: shlq $42, %rbx -; SSE4-NEXT: shrq $11, %r13 -; SSE4-NEXT: orq %rbx, %r13 -; SSE4-NEXT: movq %r13, 24(%r14) -; SSE4-NEXT: movq %r12, %rax -; SSE4-NEXT: shlq $9, %rax -; SSE4-NEXT: shrq $44, %r15 -; SSE4-NEXT: andl $511, %r15d # imm = 0x1FF -; SSE4-NEXT: orq %rax, %r15 -; SSE4-NEXT: movq %r15, 72(%r14) +; SSE4-NEXT: shldq $62, %rdi, %rax +; SSE4-NEXT: movabsq $2251799813685247, %r11 # imm = 0x7FFFFFFFFFFFF +; SSE4-NEXT: andq %rbp, %r11 +; SSE4-NEXT: movq %r10, %rbp +; SSE4-NEXT: shldq $20, %r9, %rbp +; SSE4-NEXT: movq %rax, 88(%rbx) +; SSE4-NEXT: movq %r8, %rax +; SSE4-NEXT: shldq $31, %rdx, %rax +; SSE4-NEXT: movq %rbp, 64(%rbx) +; SSE4-NEXT: movq %rcx, %rbp +; SSE4-NEXT: shldq $42, %rsi, %rbp +; SSE4-NEXT: movq %rax, 48(%rbx) +; SSE4-NEXT: movq %rbp, 32(%rbx) +; SSE4-NEXT: movabsq $9007199254738944, %rax # imm = 0x1FFFFFFFFFF800 +; SSE4-NEXT: andq %r13, %rax +; SSE4-NEXT: shldq $53, %r12, %r13 +; SSE4-NEXT: movq %r13, 16(%rbx) +; SSE4-NEXT: movq %r11, %r13 +; SSE4-NEXT: shrq $48, %r13 +; SSE4-NEXT: movb %r13b, 102(%rbx) +; SSE4-NEXT: shrq $32, %r11 +; SSE4-NEXT: movabsq $9007199254740991, %r13 # imm = 0x1FFFFFFFFFFFFF +; SSE4-NEXT: andq %r13, %r15 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; SSE4-NEXT: shldq $9, %rbp, %r15 +; SSE4-NEXT: movw %r11w, 100(%rbx) +; SSE4-NEXT: shlq $62, %rdi +; SSE4-NEXT: orq %r15, %rdi +; SSE4-NEXT: movq %rdi, 80(%rbx) +; SSE4-NEXT: andq %r13, %r14 +; SSE4-NEXT: shlq $53, %r12 +; SSE4-NEXT: orq %r14, %r12 +; SSE4-NEXT: movq %r12, 8(%rbx) +; SSE4-NEXT: shlq $42, %rsi +; SSE4-NEXT: shrq $11, %rax +; SSE4-NEXT: orq %rsi, %rax +; SSE4-NEXT: movq %rax, 24(%rbx) +; SSE4-NEXT: shlq $9, %rbp +; SSE4-NEXT: shrq $44, %r10 +; SSE4-NEXT: andl $511, %r10d # imm = 0x1FF +; SSE4-NEXT: orq %rbp, %r10 +; SSE4-NEXT: movq %r10, 72(%rbx) ; SSE4-NEXT: shlq $20, %r9 ; SSE4-NEXT: shrq $33, %r8 ; SSE4-NEXT: andl $1048575, %r8d # imm = 0xFFFFF ; SSE4-NEXT: orq %r9, %r8 -; SSE4-NEXT: movq %r8, 56(%r14) +; SSE4-NEXT: movq %r8, 56(%rbx) ; SSE4-NEXT: shlq $31, %rdx ; SSE4-NEXT: shrq $22, %rcx ; SSE4-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF ; SSE4-NEXT: orq %rdx, %rcx -; SSE4-NEXT: movq %rcx, 40(%r14) -; SSE4-NEXT: movq %r14, %rax +; SSE4-NEXT: movq %rcx, 40(%rbx) +; SSE4-NEXT: movq %rbx, %rax ; SSE4-NEXT: popq %rbx ; SSE4-NEXT: popq %r12 ; SSE4-NEXT: popq %r13 @@ -2174,14 +2169,14 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; AVX-NEXT: setl %dil ; AVX-NEXT: setg %r11b ; AVX-NEXT: subb %dil, %r11b -; AVX-NEXT: movsbq %r11b, %r11 -; AVX-NEXT: movq %r11, %rdi -; AVX-NEXT: sarq $63, %rdi +; AVX-NEXT: movsbq %r11b, %rdi ; AVX-NEXT: movq %rdi, %rbp -; AVX-NEXT: shldq $62, %r11, %rbp +; AVX-NEXT: sarq $63, %rbp +; AVX-NEXT: movl %ebp, 96(%rax) +; AVX-NEXT: movb $51, %r11b +; AVX-NEXT: bzhiq %r11, %rbp, %r11 +; AVX-NEXT: shldq $62, %rdi, %rbp ; AVX-NEXT: movq %rbp, 88(%rax) -; AVX-NEXT: shrq $2, %rdi -; AVX-NEXT: movl %edi, 96(%rax) ; AVX-NEXT: movq %r10, %rbp ; AVX-NEXT: shldq $20, %r9, %rbp ; AVX-NEXT: movq %rbp, 64(%rax) @@ -2195,23 +2190,21 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; AVX-NEXT: bzhiq %rbp, %r13, %rbp ; AVX-NEXT: shldq $53, %r12, %r13 ; AVX-NEXT: movq %r13, 16(%rax) -; AVX-NEXT: movb $53, %r13b -; AVX-NEXT: bzhiq %r13, %r15, %r15 +; AVX-NEXT: movq %r11, %r13 +; AVX-NEXT: shrq $48, %r13 +; AVX-NEXT: movb %r13b, 102(%rax) +; AVX-NEXT: shrq $32, %r11 +; AVX-NEXT: movw %r11w, 100(%rax) +; AVX-NEXT: movb $53, %r11b +; AVX-NEXT: bzhiq %r11, %r15, %r15 ; AVX-NEXT: shldq $9, %rsi, %r15 -; AVX-NEXT: shlq $62, %r11 -; AVX-NEXT: orq %r15, %r11 -; AVX-NEXT: movq %r11, 80(%rax) -; AVX-NEXT: bzhiq %r13, %r14, %r11 +; AVX-NEXT: shlq $62, %rdi +; AVX-NEXT: orq %r15, %rdi +; AVX-NEXT: movq %rdi, 80(%rax) +; AVX-NEXT: bzhiq %r11, %r14, %rdi ; AVX-NEXT: shlq $53, %r12 -; AVX-NEXT: orq %r11, %r12 +; AVX-NEXT: orq %rdi, %r12 ; AVX-NEXT: movq %r12, 8(%rax) -; AVX-NEXT: movb $51, %r11b -; AVX-NEXT: bzhiq %r11, %rdi, %rdi -; AVX-NEXT: movq %rdi, %r11 -; AVX-NEXT: shrq $48, %r11 -; AVX-NEXT: movb %r11b, 102(%rax) -; AVX-NEXT: shrq $32, %rdi -; AVX-NEXT: movw %di, 100(%rax) ; AVX-NEXT: shlq $42, %rbx ; AVX-NEXT: shrq $11, %rbp ; AVX-NEXT: orq %rbx, %rbp @@ -2270,24 +2263,24 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; X86-NEXT: addb %al, %al ; X86-NEXT: sarb %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: addb %dl, %dl -; X86-NEXT: sarb %dl -; X86-NEXT: movb {{[0-9]+}}(%esp), %ah -; X86-NEXT: addb %ah, %ah -; X86-NEXT: sarb %ah -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addb %cl, %cl -; X86-NEXT: sarb %cl -; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: addb %ch, %ch -; X86-NEXT: sarb %ch ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: addb %bl, %bl ; X86-NEXT: sarb %bl ; X86-NEXT: movb {{[0-9]+}}(%esp), %bh ; X86-NEXT: addb %bh, %bh ; X86-NEXT: sarb %bh +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: addb %dl, %dl +; X86-NEXT: sarb %dl +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: addb %ch, %ch +; X86-NEXT: sarb %ch +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: addb %cl, %cl +; X86-NEXT: sarb %cl +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah +; X86-NEXT: addb %ah, %ah +; X86-NEXT: sarb %ah ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: addb %al, %al ; X86-NEXT: sarb %al @@ -2304,140 +2297,136 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: andl $2097151, %esi # imm = 0x1FFFFF ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpb %bl, %bh +; X86-NEXT: cmpb %cl, %ah ; X86-NEXT: setl %al -; X86-NEXT: setg %dh -; X86-NEXT: subb %al, %dh -; X86-NEXT: movsbl %dh, %esi +; X86-NEXT: setg %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movsbl %cl, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $2097151, %esi # imm = 0x1FFFFF -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpb %cl, %ch +; X86-NEXT: andl $2097151, %eax # imm = 0x1FFFFF +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb %dl, %ch ; X86-NEXT: setl %al ; X86-NEXT: setg %cl ; X86-NEXT: subb %al, %cl -; X86-NEXT: movsbl %cl, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ecx, (%ebp) -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl $2097151, %esi # imm = 0x1FFFFF -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpb %dl, %ah -; X86-NEXT: setl %al +; X86-NEXT: movsbl %cl, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $2097151, %ecx # imm = 0x1FFFFF +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb %bl, %bh +; X86-NEXT: setl %cl ; X86-NEXT: setg %dl -; X86-NEXT: subb %al, %dl +; X86-NEXT: subb %cl, %dl ; X86-NEXT: movsbl %dl, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sarl $31, %edi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; X86-NEXT: setl %al -; X86-NEXT: setg %dl -; X86-NEXT: subb %al, %dl -; X86-NEXT: movsbl %dl, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; X86-NEXT: setl %al +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; X86-NEXT: setl %cl ; X86-NEXT: setg %dl -; X86-NEXT: subb %al, %dl -; X86-NEXT: movsbl %dl, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sarl $31, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload -; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload -; X86-NEXT: setl %dl -; X86-NEXT: setg %dh -; X86-NEXT: subb %dl, %dh -; X86-NEXT: movsbl %dh, %edx +; X86-NEXT: subb %cl, %dl +; X86-NEXT: movsbl %dl, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %edx, 96(%ebp) -; X86-NEXT: movl %edx, 92(%ebp) -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, 80(%ebp) -; X86-NEXT: movl %eax, 68(%ebp) -; X86-NEXT: movl %eax, 64(%ebp) -; X86-NEXT: movl %esi, 52(%ebp) -; X86-NEXT: movl %esi, 48(%ebp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, 36(%ebp) -; X86-NEXT: movl %edi, 24(%ebp) -; X86-NEXT: movl %edi, 20(%ebp) -; X86-NEXT: movl %ecx, 8(%ebp) -; X86-NEXT: movl %ecx, 4(%ebp) -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $30, %edx, %ecx -; X86-NEXT: movl %ecx, 88(%ebp) -; X86-NEXT: movl %ebp, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $9, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; X86-NEXT: setl %cl +; X86-NEXT: setg %ch +; X86-NEXT: subb %cl, %ch +; X86-NEXT: movsbl %ch, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; X86-NEXT: setl %cl +; X86-NEXT: setg %ch +; X86-NEXT: subb %cl, %ch +; X86-NEXT: movsbl %ch, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %esi, 96(%ecx) +; X86-NEXT: movl %esi, 92(%ecx) +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %ebp, 80(%ecx) +; X86-NEXT: movl %ebx, 68(%ecx) +; X86-NEXT: movl %ebx, 64(%ecx) +; X86-NEXT: movl %edx, 52(%ecx) +; X86-NEXT: movl %edx, 48(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $9, %ebp, %ecx -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: movl %ecx, 76(%ebx) -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $20, %ebx, %ecx -; X86-NEXT: movl %ecx, 60(%ebp) -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $31, %ebx, %ecx -; X86-NEXT: movl %ecx, 44(%ebp) -; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: movl %ebp, 36(%ecx) +; X86-NEXT: movl %edi, 24(%ecx) +; X86-NEXT: movl %edi, 20(%ecx) +; X86-NEXT: movl %eax, 8(%ecx) +; X86-NEXT: movl %eax, 4(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 100(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $10, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl $30, %ecx, %eax +; X86-NEXT: movl %eax, 88(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $9, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: shldl $9, %ebp, %eax +; X86-NEXT: movl %eax, 76(%esi) +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $10, %ebp, %ecx -; X86-NEXT: movl %ecx, 32(%ebx) -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: shldl $20, %ebp, %eax +; X86-NEXT: movl %eax, 60(%esi) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: shldl $31, %ebp, %eax +; X86-NEXT: movl %eax, 44(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $10, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $21, %ebp, %ecx -; X86-NEXT: movl %ecx, 16(%ebx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: shrl $2, %ecx -; X86-NEXT: movw %cx, 100(%ebx) +; X86-NEXT: shldl $10, %ebp, %eax +; X86-NEXT: movl %eax, 32(%esi) +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: shldl $21, %ebp, %eax +; X86-NEXT: movl %eax, 16(%esi) ; X86-NEXT: shll $21, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, 12(%ebx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shll $30, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, 84(%ebx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shll $9, %ecx -; X86-NEXT: shrl $12, %eax -; X86-NEXT: andl $511, %eax # imm = 0x1FF -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl %eax, 72(%ebx) +; X86-NEXT: movl %ebp, 12(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl $7, %eax +; X86-NEXT: movb %al, 102(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shll $30, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, 84(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shll $9, %eax +; X86-NEXT: shrl $12, %ebx +; X86-NEXT: andl $511, %ebx # imm = 0x1FF +; X86-NEXT: orl %eax, %ebx +; X86-NEXT: movl %ebx, 72(%esi) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shll $20, %eax -; X86-NEXT: shrl %esi -; X86-NEXT: andl $1048575, %esi # imm = 0xFFFFF -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, 56(%ebx) +; X86-NEXT: shrl %edx +; X86-NEXT: andl $1048575, %edx # imm = 0xFFFFF +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, 56(%esi) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shll $31, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, 40(%ebx) +; X86-NEXT: movl %eax, 40(%esi) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shll $10, %eax ; X86-NEXT: shrl $11, %edi ; X86-NEXT: andl $1023, %edi # imm = 0x3FF ; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl %edi, 28(%ebx) -; X86-NEXT: movl %edx, %eax -; X86-NEXT: shrl $18, %eax -; X86-NEXT: andl $7, %eax -; X86-NEXT: movb %al, 102(%ebx) -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edi, 28(%esi) +; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi