5 changes: 5 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11281,6 +11281,11 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
unsigned BitWidth = VT.getScalarSizeInBits();
SDLoc DL(N);

// fold (fshl C0, C1, C2) -> C3
if (SDValue C =
DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, {N0, N1, N2}))
return C;

// fold (fshl N0, N1, 0) -> N0
// fold (fshr N0, N1, 0) -> N1
if (isPowerOf2_32(BitWidth))
28 changes: 28 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7175,6 +7175,28 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
}
}

// Handle fshl/fshr special cases.
if (Opcode == ISD::FSHL || Opcode == ISD::FSHR) {
auto *C1 = dyn_cast<ConstantSDNode>(Ops[0]);
auto *C2 = dyn_cast<ConstantSDNode>(Ops[1]);
auto *C3 = dyn_cast<ConstantSDNode>(Ops[2]);

if (C1 && C2 && C3) {
if (C1->isOpaque() || C2->isOpaque() || C3->isOpaque())
return SDValue();
const APInt V1 = C1->getAPIntValue(), V2 = C2->getAPIntValue(),
V3 = C3->getAPIntValue();

APInt FoldedVal = Opcode == ISD::FSHL ? APIntOps::fshl(V1, V2, V3)
: APIntOps::fshr(V1, V2, V3);

SDValue Folded = getConstant(FoldedVal, DL, VT);
assert((!Folded || !VT.isVector()) &&
"Can't fold vectors ops with scalar operands");
return Folded;
}
}

// This is for vector folding only from here on.
if (!VT.isVector())
return SDValue();
@@ -8158,6 +8180,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
}
break;
}
case ISD::FSHL:
case ISD::FSHR:
// Constant folding.
if (SDValue V = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2, N3}))
return V;
break;
Collaborator: better to do this in DAGCombiner::visitFunnelShift

Member Author: IIUC, it doesn't work for constant vectors without the modification in getNode. The constant fold for vectors depends on the constant fold in getNode:

SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags);
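For readers who don't have that code in front of them: the vector branch of FoldConstantArithmetic scalarizes the operands lane by lane and re-dispatches each lane through getNode, so a vector constant only folds when the corresponding scalar fold is also performed there. A rough, simplified sketch of that loop follows (it is a fragment of the member function, not the exact upstream code; getScalarOperandLane is a hypothetical stand-in for the BUILD_VECTOR/SPLAT_VECTOR operand extraction, and the undef handling and legality bailouts are omitted):

  SmallVector<SDValue, 4> ScalarResults;
  EVT SVT = VT.getScalarType();
  for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) {
    SmallVector<SDValue, 4> ScalarOps;
    for (SDValue Op : Ops)
      ScalarOps.push_back(getScalarOperandLane(Op, I)); // hypothetical helper
    // The call quoted above: the whole vector fold hinges on getNode folding
    // the scalar opcode, which is why FSHL/FSHR needed the getNode change.
    SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags);
    if (!ScalarResult || !isa<ConstantSDNode>(ScalarResult))
      return SDValue();
    ScalarResults.push_back(ScalarResult);
  }
  return getBuildVector(VT, DL, ScalarResults);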

Collaborator: How about adding it to the bottom of getNode before memoization, like we do for the 2-op getNode? Keep it in visitFunnelShift as well if you can.

Member Author: Thanks. That sounds good to me.

Member Author: Hmm, a quick try shows that some operators like VECTOR_COMPRESS cause the assertion in FoldConstantArithmetic. I will look into the reason later.

Member Author (@XChy, Aug 21, 2025): FoldConstantArithmetic would try to fold the non-arithmetic node vector_compress <1, 2>, <3, 4>, <5, 6> into a lane-wise scalar version <(vector_compress 1, 3, 5), (vector_compress 2, 4, 6)>. That's illegal.
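To make the illegality concrete: compress packs the selected lanes toward the front of the result, so output lane N depends on the mask bits of all earlier lanes, and there is no independent per-lane scalar operation that getNode could fold. A standalone C++ model of the semantics (simplified: the passthru operand of the real intrinsic is replaced by zero-initialization here) shows the cross-lane dependence:

#include <array>
#include <cstdio>

int main() {
  std::array<int, 4> Vec = {10, 20, 30, 40};
  std::array<bool, 4> Mask = {true, false, true, false};
  std::array<int, 4> Out = {0, 0, 0, 0};

  // Selected elements are packed to the front: Out[1] receives Vec[2] only
  // because Mask[1] is false, so no lane-by-lane fold of the kind
  // FoldConstantArithmetic performs can reproduce this result.
  unsigned J = 0;
  for (unsigned I = 0; I < 4; ++I)
    if (Mask[I])
      Out[J++] = Vec[I];

  for (int V : Out)
    printf("%d ", V); // prints: 10 30 0 0
  printf("\n");
  return 0;
}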

Member Author: For other vector operators like VECTOR_SPLICE and VSELECT, it's illegal as well. Should we handle specific arithmetic operators at the bottom?

Member Author (quoting the Collaborator):

> Cheers - ideally we can get this dealt with so we can pull out the constant folding for ISD::FMA/FMAD as well.

Sorry for not noticing this response. What do you mean by "pull out the constant folding for ISD::FMA/FMAD"? Is it moving the fold in getNode to FoldConstantArithmetic?

Collaborator: Yes. I don't want to derail this patch. So if you have to add back the FSHL/FSHR cases and constant fold there, then do so and we can look at this and FMA/FMAD later.

Member Author: Thanks, though I already refactored and pulled out the FMA/FMAD fold yesterday. Feel free to review the current implementation if you see any problems.

case ISD::BUILD_VECTOR: {
// Attempt to simplify BUILD_VECTOR.
SDValue Ops[] = {N1, N2, N3};
Expand Down
149 changes: 149 additions & 0 deletions llvm/test/CodeGen/X86/fshl-fshr-constant.ll
@@ -0,0 +1,149 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,CHECK-EXPAND
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512vbmi2 | FileCheck %s --check-prefixes=CHECK,CHECK-UNEXPAND

define <4 x i32> @test_fshl_constants() {
; CHECK-EXPAND-LABEL: test_fshl_constants:
; CHECK-EXPAND: # %bb.0:
; CHECK-EXPAND-NEXT: vmovaps {{.*#+}} xmm0 = [0,512,2048,6144]
; CHECK-EXPAND-NEXT: retq
;
; CHECK-UNEXPAND-LABEL: test_fshl_constants:
; CHECK-UNEXPAND: # %bb.0:
; CHECK-UNEXPAND-NEXT: vpmovsxwd {{.*#+}} xmm0 = [0,512,2048,6144]
; CHECK-UNEXPAND-NEXT: retq
%res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> <i32 8, i32 9, i32 10, i32 11>)
ret <4 x i32> %res
}

define <4 x i32> @test_fshl_splat_constants() {
; CHECK-LABEL: test_fshl_splat_constants:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [256,256,256,256]
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> <i32 4, i32 4, i32 4, i32 4>, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
ret <4 x i32> %res
}

define <4 x i32> @test_fshl_two_constants(<4 x i32> %a) {
; CHECK-EXPAND-LABEL: test_fshl_two_constants:
; CHECK-EXPAND: # %bb.0:
; CHECK-EXPAND-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-EXPAND-NEXT: retq
;
; CHECK-UNEXPAND-LABEL: test_fshl_two_constants:
; CHECK-UNEXPAND: # %bb.0:
; CHECK-UNEXPAND-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7]
; CHECK-UNEXPAND-NEXT: vpshldvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; CHECK-UNEXPAND-NEXT: retq
%res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> <i32 8, i32 9, i32 10, i32 11>)
ret <4 x i32> %res
}

define <4 x i32> @test_fshl_one_constant(<4 x i32> %a, <4 x i32> %b) {
; CHECK-EXPAND-LABEL: test_fshl_one_constant:
; CHECK-EXPAND: # %bb.0:
; CHECK-EXPAND-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; CHECK-EXPAND-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-EXPAND-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-EXPAND-NEXT: retq
;
; CHECK-UNEXPAND-LABEL: test_fshl_one_constant:
; CHECK-UNEXPAND: # %bb.0:
; CHECK-UNEXPAND-NEXT: vpshldvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; CHECK-UNEXPAND-NEXT: retq
%res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 8, i32 9, i32 10, i32 11>)
ret <4 x i32> %res
}

define <4 x i32> @test_fshl_none_constant(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-EXPAND-LABEL: test_fshl_none_constant:
; CHECK-EXPAND: # %bb.0:
; CHECK-EXPAND-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; CHECK-EXPAND-NEXT: vpandn %xmm3, %xmm2, %xmm4
; CHECK-EXPAND-NEXT: vpsrld $1, %xmm1, %xmm1
; CHECK-EXPAND-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
; CHECK-EXPAND-NEXT: vpand %xmm3, %xmm2, %xmm2
; CHECK-EXPAND-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
; CHECK-EXPAND-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-EXPAND-NEXT: retq
;
; CHECK-UNEXPAND-LABEL: test_fshl_none_constant:
; CHECK-UNEXPAND: # %bb.0:
; CHECK-UNEXPAND-NEXT: vpshldvd %xmm2, %xmm1, %xmm0
; CHECK-UNEXPAND-NEXT: retq
%res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
ret <4 x i32> %res
}

define <4 x i32> @test_fshr_constants() {
; CHECK-LABEL: test_fshr_constants:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,8388608,8388608,6291456]
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> <i32 8, i32 9, i32 10, i32 11>)
ret <4 x i32> %res
}

define <4 x i32> @test_fshr_two_constants(<4 x i32> %a) {
; CHECK-EXPAND-LABEL: test_fshr_two_constants:
; CHECK-EXPAND: # %bb.0:
; CHECK-EXPAND-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-EXPAND-NEXT: retq
;
; CHECK-UNEXPAND-LABEL: test_fshr_two_constants:
; CHECK-UNEXPAND: # %bb.0:
; CHECK-UNEXPAND-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7]
; CHECK-UNEXPAND-NEXT: vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-UNEXPAND-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-UNEXPAND-NEXT: retq
%res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> <i32 8, i32 9, i32 10, i32 11>)
ret <4 x i32> %res
}

define <4 x i32> @test_fshr_one_constant(<4 x i32> %a, <4 x i32> %b) {
; CHECK-EXPAND-LABEL: test_fshr_one_constant:
; CHECK-EXPAND: # %bb.0:
; CHECK-EXPAND-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; CHECK-EXPAND-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-EXPAND-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-EXPAND-NEXT: retq
;
; CHECK-UNEXPAND-LABEL: test_fshr_one_constant:
; CHECK-UNEXPAND: # %bb.0:
; CHECK-UNEXPAND-NEXT: vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-UNEXPAND-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-UNEXPAND-NEXT: retq
%res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 8, i32 9, i32 10, i32 11>)
ret <4 x i32> %res
}

define <4 x i32> @test_fshr_none_constant(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-EXPAND-LABEL: test_fshr_none_constant:
; CHECK-EXPAND: # %bb.0:
; CHECK-EXPAND-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; CHECK-EXPAND-NEXT: vpand %xmm3, %xmm2, %xmm4
; CHECK-EXPAND-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
; CHECK-EXPAND-NEXT: vpandn %xmm3, %xmm2, %xmm2
; CHECK-EXPAND-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; CHECK-EXPAND-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
; CHECK-EXPAND-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-EXPAND-NEXT: retq
;
; CHECK-UNEXPAND-LABEL: test_fshr_none_constant:
; CHECK-UNEXPAND: # %bb.0:
; CHECK-UNEXPAND-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
; CHECK-UNEXPAND-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-UNEXPAND-NEXT: retq
%res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
ret <4 x i32> %res
}

define <4 x i32> @test_fshr_splat_constants() {
; CHECK-LABEL: test_fshr_splat_constants:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [16777216,16777216,16777216,16777216]
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> <i32 4, i32 4, i32 4, i32 4>, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
ret <4 x i32> %res
}
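
As a sanity check on the folded splat constants above, the funnel-shift semantics can be reproduced with plain 32-bit arithmetic: fshl conceptually concatenates the operands as a:b and returns the high word after shifting left by the amount modulo the bit width, while fshr returns the low word after shifting right. A small standalone C++ check, independent of LLVM (zero shift amounts are guarded to avoid undefined 32-bit shifts):

#include <cstdint>
#include <cstdio>

// Scalar models of the i32 funnel-shift semantics.
static uint32_t fshl32(uint32_t A, uint32_t B, uint32_t C) {
  C %= 32;
  return C ? (A << C) | (B >> (32 - C)) : A;
}

static uint32_t fshr32(uint32_t A, uint32_t B, uint32_t C) {
  C %= 32;
  return C ? (A << (32 - C)) | (B >> C) : B;
}

int main() {
  // test_fshl_splat_constants: fshl(1, 4, 8) = (1 << 8) | (4 >> 24) = 256.
  printf("%u\n", fshl32(1, 4, 8));
  // test_fshr_splat_constants: fshr(1, 4, 8) = (1 << 24) | (4 >> 8) = 16777216.
  printf("%u\n", fshr32(1, 4, 8));
  // Second lane of test_fshl_constants: fshl(1, 5, 9) = 512.
  printf("%u\n", fshl32(1, 5, 9));
  return 0;
}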