Skip to content

Commit ba603b5

Browse files
authored
[msan] Improve packed multiply-add instrumentation (#152941)
The current instrumentation has false positives: if there is a single uninitialized bit in any of the operands, the entire output is poisoned. This does not take into account that multiplying an uninitialized value with zero results in an initialized zero value. This step allows elements that are zero to clear the corresponding shadow during the multiplication step. The horizontal add step and accumulation step (if any) are modeled using bitwise OR. Future work can apply this improved handler to the AVX512 equivalent intrinsics (x86_avx512_pmaddw_d_512, x86_avx512_pmaddubs_w_512.) and AVX VNNI intrinsics.
1 parent f4dd442 commit ba603b5

File tree

8 files changed

+280
-103
lines changed

8 files changed

+280
-103
lines changed

llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp

Lines changed: 87 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3641,9 +3641,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
36413641
setOriginForNaryOp(I);
36423642
}
36433643

3644-
// Get an MMX-sized vector type.
3645-
Type *getMMXVectorTy(unsigned EltSizeInBits) {
3646-
const unsigned X86_MMXSizeInBits = 64;
3644+
// Get an MMX-sized (64-bit) vector type, or optionally, other sized
3645+
// vectors.
3646+
Type *getMMXVectorTy(unsigned EltSizeInBits,
3647+
unsigned X86_MMXSizeInBits = 64) {
36473648
assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
36483649
"Illegal MMX vector element size");
36493650
return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
@@ -3843,20 +3844,78 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
38433844
setOriginForNaryOp(I);
38443845
}
38453846

3846-
// Instrument multiply-add intrinsic.
3847-
void handleVectorPmaddIntrinsic(IntrinsicInst &I,
3848-
unsigned MMXEltSizeInBits = 0) {
3849-
Type *ResTy =
3850-
MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType();
3847+
// Instrument multiply-add intrinsics.
3848+
//
3849+
// e.g., Two operands:
3850+
// <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
3851+
// <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
3852+
//
3853+
// Three operands are not implemented yet:
3854+
// <4 x i32> @llvm.x86.avx512.vpdpbusd.128
3855+
// (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b)
3856+
// (the result of multiply-add'ing %a and %b is accumulated with %s)
3857+
void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor,
3858+
unsigned EltSizeInBits = 0) {
38513859
IRBuilder<> IRB(&I);
3852-
auto *Shadow0 = getShadow(&I, 0);
3853-
auto *Shadow1 = getShadow(&I, 1);
3854-
Value *S = IRB.CreateOr(Shadow0, Shadow1);
3855-
S = IRB.CreateBitCast(S, ResTy);
3856-
S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
3857-
ResTy);
3858-
S = IRB.CreateBitCast(S, getShadowTy(&I));
3859-
setShadow(&I, S);
3860+
3861+
[[maybe_unused]] FixedVectorType *ReturnType =
3862+
cast<FixedVectorType>(I.getType());
3863+
assert(isa<FixedVectorType>(ReturnType));
3864+
3865+
assert(I.arg_size() == 2);
3866+
3867+
// Vectors A and B, and shadows
3868+
Value *Va = I.getOperand(0);
3869+
Value *Vb = I.getOperand(1);
3870+
3871+
Value *Sa = getShadow(&I, 0);
3872+
Value *Sb = getShadow(&I, 1);
3873+
3874+
FixedVectorType *ParamType =
3875+
cast<FixedVectorType>(I.getArgOperand(0)->getType());
3876+
assert(ParamType == I.getArgOperand(1)->getType());
3877+
3878+
assert(ParamType->getPrimitiveSizeInBits() ==
3879+
ReturnType->getPrimitiveSizeInBits());
3880+
3881+
// Step 1: instrument multiplication of corresponding vector elements
3882+
if (EltSizeInBits) {
3883+
ParamType = cast<FixedVectorType>(
3884+
getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
3885+
3886+
Va = IRB.CreateBitCast(Va, ParamType);
3887+
Vb = IRB.CreateBitCast(Vb, ParamType);
3888+
3889+
Sa = IRB.CreateBitCast(Sa, getShadowTy(ParamType));
3890+
Sb = IRB.CreateBitCast(Sb, getShadowTy(ParamType));
3891+
} else {
3892+
assert(ParamType->getNumElements() ==
3893+
ReturnType->getNumElements() * ReductionFactor);
3894+
}
3895+
3896+
Value *Sab = IRB.CreateOr(Sa, Sb);
3897+
3898+
// Multiplying an uninitialized / element by zero results in an initialized
3899+
// element.
3900+
Value *Zero = Constant::getNullValue(Va->getType());
3901+
Value *VaNotZero = IRB.CreateICmpNE(Va, Zero);
3902+
Value *VbNotZero = IRB.CreateICmpNE(Vb, Zero);
3903+
Value *VaAndVbNotZero = IRB.CreateAnd(VaNotZero, VbNotZero);
3904+
3905+
// After multiplying e.g., <8 x i16> %a, <8 x i16> %b, we should have
3906+
// <8 x i32> %ab, but we cheated and ended up with <8 x i16>.
3907+
Sab = IRB.CreateAnd(Sab, IRB.CreateSExt(VaAndVbNotZero, Sab->getType()));
3908+
3909+
// Step 2: instrument horizontal add
3910+
// e.g., collapse <8 x i16> into <4 x i16> (reduction factor == 2)
3911+
// <16 x i8> into <4 x i8> (reduction factor == 4)
3912+
Value *OutShadow = horizontalReduce(I, ReductionFactor, Sab, nullptr);
3913+
3914+
// Extend to <4 x i32>.
3915+
// For MMX, cast it back to <1 x i64>.
3916+
OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
3917+
3918+
setShadow(&I, OutShadow);
38603919
setOriginForNaryOp(I);
38613920
}
38623921

@@ -5391,19 +5450,28 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
53915450
handleVectorSadIntrinsic(I);
53925451
break;
53935452

5453+
// Multiply and Add Packed Words
5454+
// < 4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
5455+
// < 8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
5456+
5457+
// Multiply and Add Packed Signed and Unsigned Bytes
5458+
// < 8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
5459+
// <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
53945460
case Intrinsic::x86_sse2_pmadd_wd:
53955461
case Intrinsic::x86_avx2_pmadd_wd:
53965462
case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
53975463
case Intrinsic::x86_avx2_pmadd_ub_sw:
5398-
handleVectorPmaddIntrinsic(I);
5464+
handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2);
53995465
break;
54005466

5467+
// <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
54015468
case Intrinsic::x86_ssse3_pmadd_ub_sw:
5402-
handleVectorPmaddIntrinsic(I, 8);
5469+
handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8);
54035470
break;
54045471

5472+
// <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
54055473
case Intrinsic::x86_mmx_pmadd_wd:
5406-
handleVectorPmaddIntrinsic(I, 16);
5474+
handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
54075475
break;
54085476

54095477
case Intrinsic::x86_sse_cmp_ss:

llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -141,10 +141,16 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
141141
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
142142
; CHECK-NEXT: call void @llvm.donothing()
143143
; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
144-
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP3]] to <8 x i32>
145-
; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP4]], zeroinitializer
146-
; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32>
147-
; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
144+
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[A0:%.*]], zeroinitializer
145+
; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[A1:%.*]], zeroinitializer
146+
; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]]
147+
; CHECK-NEXT: [[TMP12:%.*]] = sext <16 x i1> [[TMP11]] to <16 x i16>
148+
; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i16> [[TMP3]], [[TMP12]]
149+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
150+
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
151+
; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i16> [[TMP8]], [[TMP9]]
152+
; CHECK-NEXT: [[TMP6:%.*]] = zext <8 x i16> [[TMP10]] to <8 x i32>
153+
; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0]], <16 x i16> [[A1]])
148154
; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
149155
; CHECK-NEXT: ret <8 x i32> [[RES]]
150156
;
@@ -678,10 +684,16 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
678684
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
679685
; CHECK-NEXT: call void @llvm.donothing()
680686
; CHECK-NEXT: [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
681-
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <16 x i16>
682-
; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
683-
; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i16>
684-
; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
687+
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i8> [[A0:%.*]], zeroinitializer
688+
; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
689+
; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP4]], [[TMP5]]
690+
; CHECK-NEXT: [[TMP12:%.*]] = sext <32 x i1> [[TMP11]] to <32 x i8>
691+
; CHECK-NEXT: [[TMP7:%.*]] = and <32 x i8> [[TMP3]], [[TMP12]]
692+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
693+
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
694+
; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i8> [[TMP8]], [[TMP9]]
695+
; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i16>
696+
; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
685697
; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8
686698
; CHECK-NEXT: ret <16 x i16> [[RES]]
687699
;
@@ -707,10 +719,16 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
707719
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
708720
; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32
709721
; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
710-
; CHECK-NEXT: [[TMP9:%.*]] = bitcast <32 x i8> [[TMP8]] to <16 x i16>
711-
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP9]], zeroinitializer
712-
; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i16>
713-
; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1:%.*]])
722+
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i8> [[A0]], zeroinitializer
723+
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
724+
; CHECK-NEXT: [[TMP16:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]]
725+
; CHECK-NEXT: [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8>
726+
; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i8> [[TMP8]], [[TMP17]]
727+
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
728+
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
729+
; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i8> [[TMP13]], [[TMP14]]
730+
; CHECK-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[TMP15]] to <16 x i16>
731+
; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
714732
; CHECK-NEXT: store <16 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8
715733
; CHECK-NEXT: ret <16 x i16> [[RES]]
716734
;

llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll

Lines changed: 41 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1687,16 +1687,27 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
16871687
; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
16881688
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
16891689
; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
1690-
; CHECK-NEXT: [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
1691-
; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
1692-
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
1693-
; CHECK-NEXT: [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
1694-
; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
1695-
; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
1696-
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
1690+
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[MMX_VAR_I]] to <4 x i16>
1691+
; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[MMX_VAR1_I]] to <4 x i16>
1692+
; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP6]] to <4 x i16>
1693+
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
1694+
; CHECK-NEXT: [[TMP22:%.*]] = or <4 x i16> [[TMP10]], [[TMP11]]
1695+
; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer
1696+
; CHECK-NEXT: [[TMP30:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
1697+
; CHECK-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP29]], [[TMP30]]
1698+
; CHECK-NEXT: [[TMP32:%.*]] = sext <4 x i1> [[TMP31]] to <4 x i16>
1699+
; CHECK-NEXT: [[TMP23:%.*]] = and <4 x i16> [[TMP22]], [[TMP32]]
1700+
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
1701+
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <2 x i32> <i32 1, i32 3>
1702+
; CHECK-NEXT: [[TMP26:%.*]] = or <2 x i16> [[TMP24]], [[TMP25]]
1703+
; CHECK-NEXT: [[TMP27:%.*]] = bitcast <2 x i16> [[TMP26]] to i32
1704+
; CHECK-NEXT: [[TMP28:%.*]] = zext i32 [[TMP27]] to i64
1705+
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP28]] to <1 x i64>
1706+
; CHECK-NEXT: [[TMP33:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
16971707
; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
1698-
; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
1699-
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
1708+
; CHECK-NEXT: [[TMP34:%.*]] = bitcast <1 x i64> [[TMP33]] to <2 x i32>
1709+
; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
1710+
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP34]] to <1 x i64>
17001711
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
17011712
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0
17021713
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3315,16 +3326,27 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
33153326
; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
33163327
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
33173328
; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
3318-
; CHECK-NEXT: [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
3319-
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
3320-
; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
3321-
; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
3322-
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
3323-
; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
3324-
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
3325-
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
3326-
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
3327-
; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
3329+
; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP22]] to <8 x i8>
3330+
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP23]] to <8 x i8>
3331+
; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
3332+
; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
3333+
; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i8> [[TMP12]], [[TMP13]]
3334+
; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer
3335+
; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
3336+
; CHECK-NEXT: [[TMP34:%.*]] = and <8 x i1> [[TMP32]], [[TMP33]]
3337+
; CHECK-NEXT: [[TMP35:%.*]] = sext <8 x i1> [[TMP34]] to <8 x i8>
3338+
; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i8> [[TMP14]], [[TMP35]]
3339+
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
3340+
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <8 x i8> [[TMP16]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
3341+
; CHECK-NEXT: [[TMP27:%.*]] = or <4 x i8> [[TMP25]], [[TMP26]]
3342+
; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i8> [[TMP27]] to i32
3343+
; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP29]] to i64
3344+
; CHECK-NEXT: [[TMP30:%.*]] = bitcast i64 [[TMP24]] to <1 x i64>
3345+
; CHECK-NEXT: [[TMP36:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
3346+
; CHECK-NEXT: [[TMP31:%.*]] = bitcast <1 x i64> [[TMP30]] to <8 x i8>
3347+
; CHECK-NEXT: [[TMP28:%.*]] = bitcast <1 x i64> [[TMP36]] to <8 x i8>
3348+
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
3349+
; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP28]] to <1 x i64>
33283350
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
33293351
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <1 x i64> [[TMP19]], i32 0
33303352
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8

llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -763,10 +763,16 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
763763
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
764764
; CHECK-NEXT: call void @llvm.donothing()
765765
; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
766-
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <4 x i32>
767-
; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
768-
; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
769-
; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]])
766+
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[A0:%.*]], zeroinitializer
767+
; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[A1:%.*]], zeroinitializer
768+
; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]]
769+
; CHECK-NEXT: [[TMP12:%.*]] = sext <8 x i1> [[TMP11]] to <8 x i16>
770+
; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i16> [[TMP3]], [[TMP12]]
771+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
772+
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
773+
; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i16> [[TMP8]], [[TMP9]]
774+
; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP10]] to <4 x i32>
775+
; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0]], <8 x i16> [[A1]])
770776
; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
771777
; CHECK-NEXT: ret <4 x i32> [[RES]]
772778
;

0 commit comments

Comments
 (0)