Skip to content

Commit ef7dbe6

Browse files
RKSimon authored and tstellar committed
[X86][SSE] combineX86ShufflesConstants - early out for zeroable vectors (PR45443)
Shuffle combining can insert zero byte-sized elements into the shuffle mask, which combineX86ShufflesConstants will attempt to fold without taking into account whether the byte-sized type is legal (e.g. on AVX512F-only targets). If we have a fully zeroable vector then we should just return a zero version of the root type; otherwise, if the type isn't valid, we should bail. Fixes PR45443. (cherry picked from commit e3b6059)
1 parent 5fbba36 commit ef7dbe6

File tree

2 files changed

+28
-1
lines changed

2 files changed

+28
-1
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33998,6 +33998,7 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
3399833998
return SDValue();
3399933999

3400034000
// Shuffle the constant bits according to the mask.
34001+
SDLoc DL(Root);
3400134002
APInt UndefElts(NumMaskElts, 0);
3400234003
APInt ZeroElts(NumMaskElts, 0);
3400334004
APInt ConstantElts(NumMaskElts, 0);
@@ -34035,6 +34036,10 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
3403534036
}
3403634037
assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
3403734038

34039+
// Attempt to create a zero vector.
34040+
if ((UndefElts | ZeroElts).isAllOnesValue())
34041+
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
34042+
3403834043
// Create the constant data.
3403934044
MVT MaskSVT;
3404034045
if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
@@ -34043,8 +34048,9 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
3404334048
MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
3404434049

3404534050
MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
34051+
if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
34052+
return SDValue();
3404634053

34047-
SDLoc DL(Root);
3404834054
SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
3404934055
return DAG.getBitcast(VT, CstOp);
3405034056
}

llvm/test/CodeGen/X86/pr45443.ll

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,21 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=i686-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
3+
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64
4+
5+
define <16 x float> @PR45443() {
6+
; CHECK-LABEL: PR45443:
7+
; CHECK: # %bb.0: # %bb
8+
; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm0
9+
; CHECK-NEXT: ret{{[l|q]}}
10+
bb:
11+
%tmp = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> <i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040>, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>)
12+
%tmp4 = tail call fast <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> <float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000>, <16 x float> undef)
13+
%tmp5 = icmp ult <16 x i32> %tmp, <i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216>
14+
%tmp6 = and <16 x i32> %tmp, <i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215>
15+
%tmp7 = icmp ne <16 x i32> %tmp6, zeroinitializer
16+
%tmp8 = and <16 x i1> %tmp7, %tmp5
17+
%tmp9 = select fast <16 x i1> %tmp8, <16 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>, <16 x float> %tmp4
18+
ret <16 x float> %tmp9
19+
}
20+
declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>)
21+
declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>)

0 commit comments

Comments
 (0)