
Commit 2b75ff1

[msan] Reland with even more improvement: Improve packed multiply-add instrumentation (#153353)
This reverts commit cf00284, i.e., relands ba603b5. It was reverted because it was subtly wrong: multiplying an uninitialized zero should not result in an initialized zero. This reland fixes the issue by using instrumentation analogous to visitAnd (bitwise AND of an initialized zero and an uninitialized value results in an initialized value). Additionally, this reland expands a test case, fixes the commit message, and optimizes the change to avoid the need for horizontalReduce.

The current instrumentation has false positives: it does not take into account that multiplying an initialized zero value with an uninitialized value results in an initialized zero value. This change fixes the issue during the multiplication step. The horizontal add step is modeled using bitwise OR. Future work can apply this improved handler to the equivalent AVX512 intrinsics (x86_avx512_pmaddw_d_512, x86_avx512_pmaddubs_w_512) and the AVX VNNI intrinsics.
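To make the rule concrete, here is a minimal scalar model of the per-element shadow propagation (an illustration with invented names, not code from this commit):

#include <cassert>

// Per-element rule, adapted from the visitAnd analogy described above:
// sa/sb say whether the corresponding elements of a and b are poisoned;
// aNonZero/bNonZero test the concrete element values against zero.
bool productPoisoned(bool sa, bool sb, bool aNonZero, bool bNonZero) {
  return (sa && sb) || (aNonZero && sb) || (sa && bNonZero);
}

// Horizontal add step: a pair's sum is poisoned if either product is.
bool pairPoisoned(bool p0, bool p1) { return p0 || p1; }

int main() {
  // An initialized zero times a poisoned element stays initialized...
  assert(!productPoisoned(/*sa=*/false, /*sb=*/true,
                          /*aNonZero=*/false, /*bNonZero=*/true));
  // ...but a poisoned element times a non-zero element is poisoned.
  assert(productPoisoned(/*sa=*/true, /*sb=*/false,
                         /*aNonZero=*/true, /*bNonZero=*/true));
  return 0;
}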
1 parent 4485a3f commit 2b75ff1

9 files changed: +384 −109 lines changed


compiler-rt/lib/msan/tests/msan_test.cpp

Lines changed: 26 additions & 1 deletion
@@ -4271,14 +4271,39 @@ TEST(VectorSadTest, sse2_psad_bw) {
 }
 
 TEST(VectorMaddTest, mmx_pmadd_wd) {
-  V4x16 a = {Poisoned<U2>(), 1, 2, 3};
+  V4x16 a = {Poisoned<U2>(0), 1, 2, 3};
   V4x16 b = {100, 101, 102, 103};
   V2x32 c = _mm_madd_pi16(a, b);
+  // Multiply step:
+  //   {Poison * 100, 1 * 101, 2 * 102, 3 * 103}
+  //   == {Poison, 1 * 101, 2 * 102, 3 * 103}
+  // Notice that for the poisoned value, we ignored the concrete zero value.
+  //
+  // Horizontal add step:
+  //   {Poison + 1 * 101, 2 * 102 + 3 * 103}
+  //   == {Poison, 2 * 102 + 3 * 103}
 
   EXPECT_POISONED(c[0]);
   EXPECT_NOT_POISONED(c[1]);
 
   EXPECT_EQ((unsigned)(2 * 102 + 3 * 103), c[1]);
+
+  V4x16 d = {Poisoned<U2>(0), 1, 0, 3};
+  V4x16 e = {100, 101, Poisoned<U2>(102), 103};
+  V2x32 f = _mm_madd_pi16(d, e);
+  // Multiply step:
+  //   {Poison * 100, 1 * 101, 0 * Poison, 3 * 103}
+  //   == {Poison, 1 * 101, 0, 3 * 103}
+  // Notice that 0 * Poison == 0.
+  //
+  // Horizontal add step:
+  //   {Poison + 1 * 101, 0 + 3 * 103}
+  //   == {Poison, 3 * 103}
+
+  EXPECT_POISONED(f[0]);
+  EXPECT_NOT_POISONED(f[1]);
+
+  EXPECT_EQ((unsigned)(3 * 103), f[1]);
 }
 
 TEST(VectorCmpTest, mm_cmpneq_ps) {
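For reference, the behavior this test exercises can be reproduced outside the test harness with a sketch along these lines (assumptions: clang with -fsanitize=memory on x86, and that _mm_madd_pi16 lowers to one of the instrumented pmadd intrinsics; this is not part of the commit):

// Hypothetical standalone repro; build with something like:
//   clang++ -fsanitize=memory -mmmx -O0 repro.cpp && ./a.out
#include <mmintrin.h>
#include <cstdio>
#include <cstring>

int main() {
  short uninit;                       // deliberately uninitialized (poisoned)
  short a[4] = {uninit, 1, 0, 3};     // lane 2 is a concrete, initialized 0
  short b[4] = {100, 101, uninit, 103};
  __m64 va, vb;
  std::memcpy(&va, a, sizeof(va));
  std::memcpy(&vb, b, sizeof(vb));
  __m64 vc = _mm_madd_pi16(va, vb);   // {a0*b0 + a1*b1, a2*b2 + a3*b3}
  int c[2];
  std::memcpy(c, &vc, sizeof(c));
  printf("%d\n", c[1]);    // clean: 0 * poisoned == initialized 0, so c[1] == 3 * 103
  // printf("%d\n", c[0]); // would report: poisoned * 100 stays poisoned
  return 0;
}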

llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp

Lines changed: 118 additions & 19 deletions
@@ -3641,9 +3641,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
-  // Get an MMX-sized vector type.
-  Type *getMMXVectorTy(unsigned EltSizeInBits) {
-    const unsigned X86_MMXSizeInBits = 64;
+  // Get an MMX-sized (64-bit) vector type, or optionally, other sized
+  // vectors.
+  Type *getMMXVectorTy(unsigned EltSizeInBits,
+                       unsigned X86_MMXSizeInBits = 64) {
     assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
            "Illegal MMX vector element size");
     return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
@@ -3843,20 +3844,109 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
-  // Instrument multiply-add intrinsic.
-  void handleVectorPmaddIntrinsic(IntrinsicInst &I,
-                                  unsigned MMXEltSizeInBits = 0) {
-    Type *ResTy =
-        MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType();
+  // Instrument multiply-add intrinsics.
+  //
+  // e.g., Two operands:
+  //         <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
+  //
+  //       Two operands which require an EltSizeInBits override:
+  //         <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
+  //
+  //       Three operands are not implemented yet:
+  //         <4 x i32> @llvm.x86.avx512.vpdpbusd.128
+  //                       (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b)
+  //         (the result of multiply-add'ing %a and %b is accumulated with %s)
+  void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor,
+                                  unsigned EltSizeInBits = 0) {
     IRBuilder<> IRB(&I);
-    auto *Shadow0 = getShadow(&I, 0);
-    auto *Shadow1 = getShadow(&I, 1);
-    Value *S = IRB.CreateOr(Shadow0, Shadow1);
-    S = IRB.CreateBitCast(S, ResTy);
-    S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
-                       ResTy);
-    S = IRB.CreateBitCast(S, getShadowTy(&I));
-    setShadow(&I, S);
+
+    [[maybe_unused]] FixedVectorType *ReturnType =
+        cast<FixedVectorType>(I.getType());
+    assert(isa<FixedVectorType>(ReturnType));
+
+    assert(I.arg_size() == 2);
+
+    // Vectors A and B, and shadows
+    Value *Va = I.getOperand(0);
+    Value *Vb = I.getOperand(1);
+
+    Value *Sa = getShadow(&I, 0);
+    Value *Sb = getShadow(&I, 1);
+
+    FixedVectorType *ParamType =
+        cast<FixedVectorType>(I.getArgOperand(0)->getType());
+    assert(ParamType == I.getArgOperand(1)->getType());
+
+    assert(ParamType->getPrimitiveSizeInBits() ==
+           ReturnType->getPrimitiveSizeInBits());
+
+    FixedVectorType *ImplicitReturnType = ReturnType;
+    // Step 1: instrument multiplication of corresponding vector elements
+    if (EltSizeInBits) {
+      ImplicitReturnType = cast<FixedVectorType>(getMMXVectorTy(
+          EltSizeInBits * 2, ParamType->getPrimitiveSizeInBits()));
+      ParamType = cast<FixedVectorType>(
+          getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
+
+      Va = IRB.CreateBitCast(Va, ParamType);
+      Vb = IRB.CreateBitCast(Vb, ParamType);
+
+      Sa = IRB.CreateBitCast(Sa, getShadowTy(ParamType));
+      Sb = IRB.CreateBitCast(Sb, getShadowTy(ParamType));
+    } else {
+      assert(ParamType->getNumElements() ==
+             ReturnType->getNumElements() * ReductionFactor);
+    }
+
+    // Multiplying an *initialized* zero by an uninitialized element results in
+    // an initialized zero element.
+    //
+    // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value
+    // results in an unpoisoned value. We can therefore adapt the visitAnd()
+    // instrumentation:
+    //   OutShadow =   (SaNonZero & SbNonZero)
+    //               | (VaNonZero & SbNonZero)
+    //               | (SaNonZero & VbNonZero)
+    // where non-zero is checked on a per-element basis (not per bit).
+    Value *SZero = Constant::getNullValue(Va->getType());
+    Value *VZero = Constant::getNullValue(Sa->getType());
+    Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero);
+    Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero);
+    Value *VaNonZero = IRB.CreateICmpNE(Va, VZero);
+    Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero);
+
+    Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero);
+    Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero);
+    Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero);
+
+    // Each element of the vector is represented by a single bit (poisoned or
+    // not) e.g., <8 x i1>.
+    Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero});
+
+    // Extend <8 x i1> to <8 x i16>.
+    // (The real pmadd intrinsic would have computed intermediate values of
+    // <8 x i32>, but that is irrelevant for our shadow purposes because we
+    // consider each element to be either fully initialized or fully
+    // uninitialized.)
+    And = IRB.CreateSExt(And, Sa->getType());
+
+    // Step 2: instrument horizontal add
+    // We don't need bit-precise horizontalReduce because we only want to check
+    // if each pair of elements is fully zero.
+    // Cast to <4 x i32>.
+    Value *Horizontal = IRB.CreateBitCast(And, ImplicitReturnType);
+
+    // Compute <4 x i1>, then extend back to <4 x i32>.
+    Value *OutShadow = IRB.CreateSExt(
+        IRB.CreateICmpNE(Horizontal,
+                         Constant::getNullValue(Horizontal->getType())),
+        ImplicitReturnType);
+
+    // For MMX, cast it back to the required fake return type (<1 x i64>).
+    if (EltSizeInBits)
+      OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
+
+    setShadow(&I, OutShadow);
     setOriginForNaryOp(I);
   }
 
@@ -5391,19 +5481,28 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       handleVectorSadIntrinsic(I);
       break;
 
+    // Multiply and Add Packed Words
+    //   < 4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
+    //   < 8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
+    //
+    // Multiply and Add Packed Signed and Unsigned Bytes
+    //   < 8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
+    //   <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
     case Intrinsic::x86_sse2_pmadd_wd:
     case Intrinsic::x86_avx2_pmadd_wd:
     case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
    case Intrinsic::x86_avx2_pmadd_ub_sw:
-      handleVectorPmaddIntrinsic(I);
+      handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2);
      break;
 
+    // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
    case Intrinsic::x86_ssse3_pmadd_ub_sw:
-      handleVectorPmaddIntrinsic(I, 8);
+      handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8);
      break;
 
+    // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
    case Intrinsic::x86_mmx_pmadd_wd:
-      handleVectorPmaddIntrinsic(I, 16);
+      handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
      break;
 
    case Intrinsic::x86_sse_cmp_ss:
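A note on the "Step 2" comment in the handler above: the bitcast + icmp-ne-zero pattern is a cheap stand-in for a per-pair OR. After the sign-extend, every lane of the intermediate shadow is either all-zeros or all-ones, so viewing two adjacent lanes as one double-width lane and testing it against zero is exactly "poisoned if either half is". A small model of why that holds (illustrative only, not from the commit):

#include <cassert>
#include <cstdint>

// Two adjacent 16-bit shadow lanes (0x0000 = clean, 0xFFFF = poisoned),
// reinterpreted as one 32-bit lane, exactly as the bitcast does.
bool wideLanePoisoned(uint16_t s0, uint16_t s1) {
  uint32_t wide = (uint32_t(s1) << 16) | s0;  // same bits, wider lane
  return wide != 0;                           // icmp ne 0 on the wide lane
}

int main() {
  assert(wideLanePoisoned(0xFFFF, 0x0000));   // poisoned + clean -> poisoned
  assert(wideLanePoisoned(0x0000, 0xFFFF));
  assert(!wideLanePoisoned(0x0000, 0x0000));  // clean pair stays clean
  return 0;
}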

llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll

Lines changed: 42 additions & 15 deletions
@@ -140,11 +140,20 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP3]] to <8 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32>
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i16> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <16 x i16> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP12]], [[TMP5]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP4]], [[TMP13]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP11]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT: [[TMP7:%.*]] = sext <16 x i1> [[TMP17]] to <16 x i16>
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i16> [[TMP7]] to <8 x i32>
+; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0]], <16 x i16> [[A1]])
 ; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <8 x i32> [[RES]]
 ;
@@ -677,11 +686,20 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <16 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i16>
-; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP12]], [[TMP5]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP4]], [[TMP13]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP11]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT: [[TMP7:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i8>
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <32 x i8> [[TMP7]] to <16 x i16>
+; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP18]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i16>
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
 ; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <16 x i16> [[RES]]
 ;
@@ -706,11 +724,20 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #
 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32
-; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <32 x i8> [[TMP8]] to <16 x i16>
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i16>
-; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i8> [[_MSLD]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <32 x i8> [[A0]], zeroinitializer
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP19:%.*]] = and <32 x i1> [[TMP17]], [[TMP10]]
+; CHECK-NEXT: [[TMP20:%.*]] = and <32 x i1> [[TMP9]], [[TMP18]]
+; CHECK-NEXT: [[TMP21:%.*]] = or <32 x i1> [[TMP16]], [[TMP19]]
+; CHECK-NEXT: [[TMP22:%.*]] = or <32 x i1> [[TMP21]], [[TMP20]]
+; CHECK-NEXT: [[TMP12:%.*]] = sext <32 x i1> [[TMP22]] to <32 x i8>
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast <32 x i8> [[TMP12]] to <16 x i16>
+; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <16 x i16> [[TMP23]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP24]] to <16 x i16>
+; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]])
 ; CHECK-NEXT: store <16 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <16 x i16> [[RES]]
 ;

llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll

Lines changed: 47 additions & 19 deletions
@@ -1687,16 +1687,30 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
-; CHECK-NEXT: [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[MMX_VAR_I]] to <4 x i16>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[MMX_VAR1_I]] to <4 x i16>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP6]] to <4 x i16>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
+; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <4 x i16> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP30:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP29]], [[TMP30]]
+; CHECK-NEXT: [[TMP35:%.*]] = and <4 x i1> [[TMP22]], [[TMP30]]
+; CHECK-NEXT: [[TMP36:%.*]] = and <4 x i1> [[TMP29]], [[TMP32]]
+; CHECK-NEXT: [[TMP37:%.*]] = or <4 x i1> [[TMP31]], [[TMP35]]
+; CHECK-NEXT: [[TMP38:%.*]] = or <4 x i1> [[TMP37]], [[TMP36]]
+; CHECK-NEXT: [[TMP23:%.*]] = sext <4 x i1> [[TMP38]] to <4 x i16>
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i16> [[TMP23]] to <2 x i32>
+; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <2 x i32> [[TMP24]], zeroinitializer
+; CHECK-NEXT: [[TMP27:%.*]] = sext <2 x i1> [[TMP25]] to <2 x i32>
+; CHECK-NEXT: [[TMP28:%.*]] = bitcast <2 x i32> [[TMP27]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP28]] to <1 x i64>
+; CHECK-NEXT: [[TMP33:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
 ; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT: [[TMP34:%.*]] = bitcast <1 x i64> [[TMP33]] to <2 x i32>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP34]] to <1 x i64>
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
 ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0
 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3315,16 +3329,30 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
 ; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
-; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP22]] to <8 x i8>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP23]] to <8 x i8>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <8 x i8> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP34:%.*]] = and <8 x i1> [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP38:%.*]] = and <8 x i1> [[TMP35]], [[TMP33]]
+; CHECK-NEXT: [[TMP39:%.*]] = and <8 x i1> [[TMP32]], [[TMP37]]
+; CHECK-NEXT: [[TMP40:%.*]] = or <8 x i1> [[TMP34]], [[TMP38]]
+; CHECK-NEXT: [[TMP41:%.*]] = or <8 x i1> [[TMP40]], [[TMP39]]
+; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP41]] to <8 x i8>
+; CHECK-NEXT: [[TMP26:%.*]] = bitcast <8 x i8> [[TMP16]] to <4 x i16>
+; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <4 x i16> [[TMP26]], zeroinitializer
+; CHECK-NEXT: [[TMP29:%.*]] = sext <4 x i1> [[TMP25]] to <4 x i16>
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i16> [[TMP29]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast i64 [[TMP24]] to <1 x i64>
+; CHECK-NEXT: [[TMP36:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP31:%.*]] = bitcast <1 x i64> [[TMP30]] to <8 x i8>
+; CHECK-NEXT: [[TMP28:%.*]] = bitcast <1 x i64> [[TMP36]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP28]] to <1 x i64>
 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <1 x i64> [[TMP19]], i32 0
 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
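The long bitcast chains in these MMX tests come from the calling convention: the intrinsic's nominal type is <1 x i64>, so the handler computes the shadow as narrower lanes and CreateShadowCast packs it back into a single 64-bit shadow word. A hedged scalar model of that final repacking (invented names, little-endian as on x86):

#include <cassert>
#include <cstdint>
#include <cstring>

// Models the tail of the MMX path: two 32-bit shadow lanes are
// reassembled into the i64 that backs the fake <1 x i64> return type.
uint64_t toMMXShadow(uint32_t lane0, uint32_t lane1) {
  uint32_t lanes[2] = {lane0, lane1};
  uint64_t out;
  std::memcpy(&out, lanes, sizeof(out));  // bitcast <2 x i32> -> i64
  return out;
}

int main() {
  // Lane 0 poisoned, lane 1 clean: the low half of the word is all-ones.
  assert(toMMXShadow(0xFFFFFFFFu, 0) == 0x00000000FFFFFFFFull);
  return 0;
}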
