Commit e4a124d
[DAG] Fold (srl (shl x, c1), c2) -> and(shl/srl(x, c3), m)
Similar to the existing (shl (srl x, c1), c2) fold. Part of the work to fix the regressions in D77804. Differential Revision: https://reviews.llvm.org/D125836
1 parent 26041e1 commit e4a124d

12 files changed, +226 -150 lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 35 additions & 9 deletions
@@ -9419,15 +9419,41 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
     }
   }
 
-  // fold (srl (shl x, c), c) -> (and x, cst2)
-  // TODO - (srl (shl x, c1), c2).
-  if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
-      isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
-    SDLoc DL(N);
-    SDValue Mask =
-        DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
-    AddToWorklist(Mask.getNode());
-    return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
+  // fold (srl (shl x, c1), c2) -> (and (shl x, (sub c1, c2)), MASK) or
+  //                               (and (srl x, (sub c2, c1)), MASK)
+  if (N0.getOpcode() == ISD::SHL &&
+      (N0.getOperand(1) == N1 || N0->hasOneUse()) &&
+      TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
+    auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS,
+                                           ConstantSDNode *RHS) {
+      const APInt &LHSC = LHS->getAPIntValue();
+      const APInt &RHSC = RHS->getAPIntValue();
+      return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) &&
+             LHSC.getZExtValue() <= RHSC.getZExtValue();
+    };
+    if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
+                                  /*AllowUndefs*/ false,
+                                  /*AllowTypeMismatch*/ true)) {
+      SDLoc DL(N);
+      SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
+      SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
+      SDValue Mask = DAG.getAllOnesConstant(DL, VT);
+      Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N01);
+      Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, Diff);
+      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff);
+      return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
+    }
+    if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
+                                  /*AllowUndefs*/ false,
+                                  /*AllowTypeMismatch*/ true)) {
+      SDLoc DL(N);
+      SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
+      SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
+      SDValue Mask = DAG.getAllOnesConstant(DL, VT);
+      Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N1);
+      SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff);
+      return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
+    }
   }
 
   // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask)
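
As a sanity check on the two branches above, here is a minimal standalone C++ sketch (illustrative only, not LLVM code; the shift amounts 7 and 2 are arbitrary) that mirrors the Diff/Mask/Shift construction on uint32_t:

#include <cassert>
#include <cstdint>

// Mirrors the c2 <= c1 branch: Shift = shl(x, Diff), Mask = shl(srl(~0, c1), Diff).
static uint32_t foldUndershift(uint32_t X, unsigned C1, unsigned C2) {
  unsigned Diff = C1 - C2;
  uint32_t Mask = (~0u >> C1) << Diff;
  return (X << Diff) & Mask;
}

// Mirrors the c1 <= c2 branch: Shift = srl(x, Diff), Mask = srl(~0, c2).
static uint32_t foldOvershift(uint32_t X, unsigned C1, unsigned C2) {
  unsigned Diff = C2 - C1;
  uint32_t Mask = ~0u >> C2;
  return (X >> Diff) & Mask;
}

int main() {
  for (uint32_t X : {0u, 1u, 0x80000001u, 0xDEADBEEFu, ~0u}) {
    assert(foldUndershift(X, 7, 2) == ((X << 7) >> 2)); // srl(shl(x,7),2)
    assert(foldOvershift(X, 2, 7) == ((X << 2) >> 7));  // srl(shl(x,2),7)
  }
  return 0;
}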

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 12 additions & 1 deletion
@@ -13411,7 +13411,18 @@ bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
           N->getOperand(0).getOpcode() == ISD::SHL)) &&
          "Expected shift-shift mask");
   // Don't allow multiuse shift folding with the same shift amount.
-  return N->getOperand(0)->hasOneUse();
+  if (!N->getOperand(0)->hasOneUse())
+    return false;
+
+  // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
+  EVT VT = N->getValueType(0);
+  if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
+    auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
+    auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
+  }
+
+  return true;
 }
 
 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
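
The guard above works because an "overshift" pair is exactly an unsigned bitfield extract: for c2 > c1, srl(shl(x, c1), c2) extracts 32 - c2 bits of x starting at bit c2 - c1, which AArch64 matches as UBFX. A hedged sketch (the constants are arbitrary, and ubfx is a stand-in modelling the instruction's semantics, not an LLVM API):

#include <cassert>
#include <cstdint>

// Models UBFX semantics: extract Width bits of X starting at bit Lsb.
static uint32_t ubfx(uint32_t X, unsigned Lsb, unsigned Width) {
  return (X >> Lsb) & ((1u << Width) - 1u);
}

int main() {
  const unsigned C1 = 3, C2 = 7; // C2 > C1: the case the hook now declines to fold
  for (uint32_t X : {0u, 0x12345678u, ~0u}) {
    assert(((X << C1) >> C2) == ubfx(X, C2 - C1, 32 - C2));
  }
  return 0;
}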

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 1 addition & 0 deletions
@@ -5844,6 +5844,7 @@ bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
          (N->getOpcode() == ISD::SRL &&
           N->getOperand(0).getOpcode() == ISD::SHL)) &&
          "Expected shift-shift mask");
+  // TODO: Should we always create i64 masks? Or only folded immediates?
   EVT VT = N->getValueType(0);
   if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
       (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {

llvm/test/CodeGen/AArch64/ushl_sat.ll

Lines changed: 4 additions & 5 deletions
@@ -129,11 +129,10 @@ define i16 @combine_shlsat_to_shl_no_fold(i16 %x) nounwind {
 ; CHECK-LABEL: combine_shlsat_to_shl_no_fold:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: and w8, w0, #0xfffc
-; CHECK-NEXT: lsl w9, w8, #14
-; CHECK-NEXT: lsl w8, w8, #17
-; CHECK-NEXT: and w10, w9, #0x1fff0000
-; CHECK-NEXT: cmp w9, w10
-; CHECK-NEXT: csinv w8, w8, wzr, eq
+; CHECK-NEXT: lsl w9, w8, #17
+; CHECK-NEXT: lsl w8, w8, #14
+; CHECK-NEXT: cmp w8, w9, lsr #3
+; CHECK-NEXT: csinv w8, w9, wzr, eq
 ; CHECK-NEXT: lsr w0, w8, #16
 ; CHECK-NEXT: ret
 %x2 = lshr i16 %x, 2

llvm/test/CodeGen/AMDGPU/idot8s.ll

Lines changed: 45 additions & 45 deletions
@@ -2852,7 +2852,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT: v_bfe_i32 v8, v2, 8, 4
 ; GFX7-NEXT: v_bfe_i32 v9, v2, 4, 4
 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
 ; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6
@@ -2861,67 +2861,67 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9
 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
 ; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_ashrrev_i32_e32 v11, 28, v0
-; GFX7-NEXT: v_bfe_i32 v12, v0, 24, 4
-; GFX7-NEXT: v_bfe_i32 v13, v0, 20, 4
-; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4
-; GFX7-NEXT: v_bfe_i32 v15, v0, 12, 4
-; GFX7-NEXT: v_bfe_i32 v16, v0, 8, 4
-; GFX7-NEXT: v_bfe_i32 v17, v0, 4, 4
+; GFX7-NEXT: v_ashrrev_i32_e32 v10, 28, v0
+; GFX7-NEXT: v_bfe_i32 v11, v0, 24, 4
+; GFX7-NEXT: v_bfe_i32 v12, v0, 20, 4
+; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4
+; GFX7-NEXT: v_bfe_i32 v14, v0, 12, 4
+; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4
+; GFX7-NEXT: v_bfe_i32 v16, v0, 4, 4
 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v10
-; GFX7-NEXT: v_or_b32_e32 v5, v6, v5
-; GFX7-NEXT: v_or_b32_e32 v6, v8, v7
+; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v8, v7
 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v9
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v11
-; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v12
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v13
-; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v14
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v15
-; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v16
-; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v10
+; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v12
+; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v14
+; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v16
 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX7-NEXT: v_or_b32_e32 v7, v8, v7
-; GFX7-NEXT: v_or_b32_e32 v8, v10, v9
-; GFX7-NEXT: v_or_b32_e32 v9, v13, v12
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v14
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v7, v6
+; GFX7-NEXT: v_or_b32_e32 v7, v9, v8
+; GFX7-NEXT: v_or_b32_e32 v8, v11, v10
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v8
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v13
+; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v2
 ; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v8
-; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 8
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX7-NEXT: v_bfe_u32 v9, v2, 8, 8
 ; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 8
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v7, v13, v1
-; GFX7-NEXT: v_or_b32_e32 v5, v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2
+; GFX7-NEXT: v_mad_u32_u24 v1, v8, v13, v1
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2
 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v0
 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v9, v14, v1
 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v4
+; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v4
 ; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v5
-; GFX7-NEXT: v_mad_u32_u24 v0, v6, v12, v0
-; GFX7-NEXT: v_bfe_u32 v10, v4, 8, 8
+; GFX7-NEXT: v_mad_u32_u24 v0, v7, v12, v0
+; GFX7-NEXT: v_bfe_u32 v11, v4, 8, 8
 ; GFX7-NEXT: v_bfe_u32 v16, v5, 8, 8
-; GFX7-NEXT: v_mad_u32_u24 v0, v9, v15, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v10, v15, v0
 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8
 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8
-; GFX7-NEXT: v_mad_u32_u24 v0, v10, v16, v0
-; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX7-NEXT: v_mad_u32_u24 v0, v11, v16, v0
+; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8
+; GFX7-NEXT: v_bfe_u32 v6, v6, 8, 8
 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v5, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v11, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT: s_endpgm
 ;

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll

Lines changed: 4 additions & 4 deletions
@@ -683,8 +683,8 @@ define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(
 ; SI-NEXT: s_mov_b32 s4, s0
 ; SI-NEXT: s_mov_b32 s5, s1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 31, v0
-; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 30, v0
+; SI-NEXT: v_and_b32_e32 v0, 2.0, v0
 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT: s_endpgm
 ;
@@ -702,8 +702,8 @@ define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(
 ; VI-NEXT: s_mov_b32 s4, s0
 ; VI-NEXT: s_mov_b32 s5, s1
 ; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v0, 31, v0
-; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 30, v0
+; VI-NEXT: v_and_b32_e32 v0, 2.0, v0
 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT: s_endpgm
 %x = load i32, i32 addrspace(1)* %in, align 4
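
The odd-looking 2.0 operand in the new sequence is the inline constant whose IEEE-754 bit pattern is 0x40000000. A quick C++ check that the rewritten pair is equivalent (plain C++, not target code):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 2u, 3u, 0xFFFFFFFFu}) {
    uint32_t Old = (X << 31) >> 1;          // shl 31, then lshr 1
    uint32_t New = (X << 30) & 0x40000000u; // shl 30, then mask with the bits of 2.0f
    assert(Old == New);
  }
  return 0;
}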

llvm/test/CodeGen/ARM/umulo-32.ll

Lines changed: 6 additions & 11 deletions
@@ -31,23 +31,18 @@ define i32 @test2(i32* %m_degree) ssp {
 ; CHECK-LABEL: test2:
 ; CHECK: @ %bb.0:
 ; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: movs r1, #7
-; CHECK-NEXT: lsls r1, r1, #29
-; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: mov r2, r0
-; CHECK-NEXT: bics r2, r1
-; CHECK-NEXT: subs r1, r0, r2
+; CHECK-NEXT: ldr r1, [r0]
+; CHECK-NEXT: lsls r0, r1, #3
+; CHECK-NEXT: lsrs r2, r0, #3
+; CHECK-NEXT: subs r1, r1, r2
 ; CHECK-NEXT: subs r2, r1, #1
 ; CHECK-NEXT: sbcs r1, r2
 ; CHECK-NEXT: movs r4, #0
 ; CHECK-NEXT: cmp r1, #0
-; CHECK-NEXT: bne .LBB1_2
+; CHECK-NEXT: beq .LBB1_2
 ; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: lsls r0, r0, #3
-; CHECK-NEXT: b .LBB1_3
-; CHECK-NEXT: .LBB1_2:
 ; CHECK-NEXT: mvns r0, r4
-; CHECK-NEXT: .LBB1_3:
+; CHECK-NEXT: .LBB1_2:
 ; CHECK-NEXT: bl _Znam
 ; CHECK-NEXT: mov r0, r4
 ; CHECK-NEXT: pop {r4, pc}
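
Both the old and new Thumb1 sequences compute x & 0x1FFFFFFF; the old one spends two instructions (movs + lsls) materializing 0xE0000000 for the bics, while the shift pair needs no constant, which is presumably why the target keeps the shifts here. A quick equivalence check (plain C++; constants taken from the test):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 0x12345678u, 0xE0000000u, ~0u}) {
    uint32_t ShiftPair = (X << 3) >> 3;  // lsls #3 ; lsrs #3
    uint32_t MaskForm = X & 0x1FFFFFFFu; // movs #7 ; lsls #29 ; bics
    assert(ShiftPair == MaskForm);
  }
  return 0;
}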

llvm/test/CodeGen/X86/pr32588.ll

Lines changed: 2 additions & 3 deletions
@@ -9,9 +9,8 @@ define void @fn1() {
 ; CHECK-LABEL: fn1:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: cmpl $1, c(%rip)
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: cmpl $0, c(%rip)
+; CHECK-NEXT: sete %al
 ; CHECK-NEXT: movl %eax, d(%rip)
 ; CHECK-NEXT: retq
 %t0 = load i32, i32* @c, align 4

llvm/test/CodeGen/X86/pull-binop-through-shift.ll

Lines changed: 2 additions & 3 deletions
@@ -195,10 +195,9 @@ define i32 @and_signbit_lshr(i32 %x, i32* %dst) {
 ;
 ; X86-LABEL: and_signbit_lshr:
 ; X86: # %bb.0:
-; X86-NEXT: movzwl 6(%esp), %eax
-; X86-NEXT: shll $16, %eax
 ; X86-NEXT: movl 8(%esp), %ecx
-; X86-NEXT: shrl $8, %eax
+; X86-NEXT: movzwl 6(%esp), %eax
+; X86-NEXT: shll $8, %eax
 ; X86-NEXT: movl %eax, (%ecx)
 ; X86-NEXT: retl
 %t0 = and i32 %x, 4294901760 ; 0xFFFF0000
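
On little-endian x86, movzwl 6(%esp) zero-extends the upper half of the i32 argument at 4(%esp), i.e. x >> 16, so the new sequence computes (zext16(x >> 16)) << 8. A small C++ check that this matches the original and + lshr (hedged; the stack-slot reading is an assumption about the asm above):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 0x12345678u, 0xFFFF0000u, ~0u}) {
    uint32_t Ref = (X & 0xFFFF0000u) >> 8;                        // original and+lshr
    uint32_t Opt = uint32_t{static_cast<uint16_t>(X >> 16)} << 8; // movzwl + shll $8
    assert(Ref == Opt);
  }
  return 0;
}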

llvm/test/CodeGen/X86/rotate-extract-vector.ll

Lines changed: 13 additions & 7 deletions
@@ -147,13 +147,19 @@ define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {
 
 ; Result would undershift
 define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
-; CHECK-LABEL: no_extract_shl:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpsllq $11, %ymm0, %ymm1
-; CHECK-NEXT: vpsllq $24, %ymm0, %ymm0
-; CHECK-NEXT: vpsrlq $50, %ymm1, %ymm1
-; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: ret{{[l|q]}}
+; X86-LABEL: no_extract_shl:
+; X86: # %bb.0:
+; X86-NEXT: vpsllq $24, %ymm0, %ymm1
+; X86-NEXT: vpsrlq $39, %ymm0, %ymm0
+; X86-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: no_extract_shl:
+; X64: # %bb.0:
+; X64-NEXT: vpsllq $24, %ymm0, %ymm1
+; X64-NEXT: vpsrlq $39, %ymm0, %ymm0
+; X64-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
+; X64-NEXT: retq
 %lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11>
 %rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24>
 %lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50>
