Merged
Changes from 1 commit
183 changes: 171 additions & 12 deletions llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3846,15 +3846,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}

// Instrument multiply-add intrinsics.
// Instrument multiply-add(-accumulate)? intrinsics.
//
// e.g., Two operands:
// <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
//
// Two operands which require an EltSizeInBits override:
// <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
//
// Three operands are not implemented yet:
// Three operands:
// <4 x i32> @llvm.x86.avx512.vpdpbusd.128
// (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b)
// (the result of multiply-add'ing %a and %b is accumulated with %s)
Contributor:
What operation is "accumulated" here? Can you clarify in the comment?

Contributor (Author):
I've elaborated.

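For reference, a minimal C++ sketch (not part of the patch; exact overflow and saturation behaviour of the 's' variants omitted) of the per-element arithmetic that a 3-operand multiply-add-accumulate intrinsic such as llvm.x86.avx512.vpdpbusd.128 performs, i.e. what "accumulated with %s" means:

#include <array>
#include <cstdint>

// Reference model of <4 x i32> vpdpbusd.128(<4 x i32> %s, %a, %b), with %a
// viewed as 16 unsigned bytes and %b as 16 signed bytes: each output dword is
// the accumulator dword plus the sum of 4 adjacent byte products
// (ReductionFactor = 4, EltSize = 8).
std::array<int32_t, 4> Vpdpbusd128Reference(std::array<int32_t, 4> S,
                                            std::array<uint8_t, 16> A,
                                            std::array<int8_t, 16> B) {
  std::array<int32_t, 4> Out{};
  for (int I = 0; I < 4; ++I) {
    int32_t Sum = 0;
    for (int J = 0; J < 4; ++J)
      Sum += int32_t(A[4 * I + J]) * int32_t(B[4 * I + J]);
    Out[I] = S[I] + Sum; // the multiply-add result is accumulated with %s
  }
  return Out;
}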
@@ -3866,22 +3866,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
cast<FixedVectorType>(I.getType());
assert(isa<FixedVectorType>(ReturnType));

assert(I.arg_size() == 2);

// Vectors A and B, and shadows
Value *Va = I.getOperand(0);
Value *Vb = I.getOperand(1);
Value *Va = nullptr;
Value *Vb = nullptr;
Value *Sa = nullptr;
Value *Sb = nullptr;

Value *Sa = getShadow(&I, 0);
Value *Sb = getShadow(&I, 1);
if (I.arg_size() == 2) {
Va = I.getOperand(0);
Vb = I.getOperand(1);

Sa = getShadow(&I, 0);
Sb = getShadow(&I, 1);
} else if (I.arg_size() == 3) {
// Operand 0 is the accumulator. We will deal with that below.
Va = I.getOperand(1);
Vb = I.getOperand(2);

Sa = getShadow(&I, 1);
Sb = getShadow(&I, 2);
} else {
assert(I.arg_size() == 2 || I.arg_size() == 3);
Contributor:
Put this in front of the if and leave out the else branch? It's cleaner.

Contributor (Author):
Done.

}

FixedVectorType *ParamType =
cast<FixedVectorType>(I.getArgOperand(0)->getType());
assert(ParamType == I.getArgOperand(1)->getType());
FixedVectorType *ParamType = cast<FixedVectorType>(Va->getType());
assert(ParamType == Vb->getType());

assert(ParamType->getPrimitiveSizeInBits() ==
ReturnType->getPrimitiveSizeInBits());

if (I.arg_size() == 3) {
assert(ParamType == ReturnType);
assert(ParamType == I.getArgOperand(0)->getType());
}

FixedVectorType *ImplicitReturnType = ReturnType;
// Step 1: instrument multiplication of corresponding vector elements
if (EltSizeInBits) {
@@ -3944,10 +3962,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Constant::getNullValue(Horizontal->getType())),
ImplicitReturnType);

// For MMX, cast it back to the required fake return type (<1 x i64>).
// Cast it back to the required fake return type (<1 x i64>).
if (EltSizeInBits)
OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));

// Step 3 (if applicable): instrument accumulator
if (I.arg_size() == 3)
OutShadow = IRB.CreateOr(OutShadow, getShadow(&I, 0));

setShadow(&I, OutShadow);
setOriginForNaryOp(I);
}
Expand Down Expand Up @@ -5507,6 +5529,143 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
break;

// AVX Vector Neural Network Instructions: bytes
//
// Multiply and Add Packed Signed and Unsigned Bytes
// < 4 x i32> @llvm.x86.avx512.vpdpbusd.128
// (< 4 x i32>, < 4 x i32>, < 4 x i32>)
// < 8 x i32> @llvm.x86.avx512.vpdpbusd.256
// (< 8 x i32>, < 8 x i32>, < 8 x i32>)
// <16 x i32> @llvm.x86.avx512.vpdpbusd.512
// (<16 x i32>, <16 x i32>, <16 x i32>)
//
// Multiply and Add Unsigned and Signed Bytes With Saturation
// < 4 x i32> @llvm.x86.avx512.vpdpbusds.128
// (< 4 x i32>, < 4 x i32>, < 4 x i32>)
// < 8 x i32> @llvm.x86.avx512.vpdpbusds.256
// (< 8 x i32>, < 8 x i32>, < 8 x i32>)
// <16 x i32> @llvm.x86.avx512.vpdpbusds.512
// (<16 x i32>, <16 x i32>, <16 x i32>)
//
// < 4 x i32> @llvm.x86.avx2.vpdpbssd.128
// (< 4 x i32>, < 4 x i32>, < 4 x i32>)
// < 8 x i32> @llvm.x86.avx2.vpdpbssd.256
// (< 8 x i32>, < 8 x i32>, < 8 x i32>)
//
// < 4 x i32> @llvm.x86.avx2.vpdpbssds.128
// (< 4 x i32>, < 4 x i32>, < 4 x i32>)
// < 8 x i32> @llvm.x86.avx2.vpdpbssds.256
// (< 8 x i32>, < 8 x i32>, < 8 x i32>)
//
// <16 x i32> @llvm.x86.avx10.vpdpbssd.512
// (<16 x i32>, <16 x i32>, <16 x i32>)
// <16 x i32> @llvm.x86.avx10.vpdpbssds.512
// (<16 x i32>, <16 x i32>, <16 x i32>)
//
// These intrinsics are auto-upgraded into non-masked forms:
// <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128
// (<4 x i32>, <4 x i32>, <4 x i32>, i8)
// <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128
// (<4 x i32>, <4 x i32>, <4 x i32>, i8)
// <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256
// (<8 x i32>, <8 x i32>, <8 x i32>, i8)
// <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256
// (<8 x i32>, <8 x i32>, <8 x i32>, i8)
// <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512
// (<16 x i32>, <16 x i32>, <16 x i32>, i16)
// <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512
// (<16 x i32>, <16 x i32>, <16 x i32>, i16)
//
// <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128
// (<4 x i32>, <4 x i32>, <4 x i32>, i8)
// <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128
// (<4 x i32>, <4 x i32>, <4 x i32>, i8)
// <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256
// (<8 x i32>, <8 x i32>, <8 x i32>, i8)
// <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256
// (<8 x i32>, <8 x i32>, <8 x i32>, i8)
// <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512
// (<16 x i32>, <16 x i32>, <16 x i32>, i16)
// <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512
// (<16 x i32>, <16 x i32>, <16 x i32>, i16)
case Intrinsic::x86_avx512_vpdpbusd_128:
case Intrinsic::x86_avx512_vpdpbusd_256:
case Intrinsic::x86_avx512_vpdpbusd_512:
case Intrinsic::x86_avx512_vpdpbusds_128:
case Intrinsic::x86_avx512_vpdpbusds_256:
case Intrinsic::x86_avx512_vpdpbusds_512:
case Intrinsic::x86_avx2_vpdpbssd_128:
case Intrinsic::x86_avx2_vpdpbssd_256:
case Intrinsic::x86_avx2_vpdpbssds_128:
case Intrinsic::x86_avx2_vpdpbssds_256:
case Intrinsic::x86_avx10_vpdpbssd_512:
case Intrinsic::x86_avx10_vpdpbssds_512:
handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8);
break;

// AVX Vector Neural Network Instructions: words
//
// Multiply and Add Signed Word Integers
// < 4 x i32> @llvm.x86.avx512.vpdpwssd.128
// (< 4 x i32>, < 4 x i32>, < 4 x i32>)
// < 8 x i32> @llvm.x86.avx512.vpdpwssd.256
// (< 8 x i32>, < 8 x i32>, < 8 x i32>)
// <16 x i32> @llvm.x86.avx512.vpdpwssd.512
// (<16 x i32>, <16 x i32>, <16 x i32>)
//
// Multiply and Add Signed Word Integers With Saturation
// < 4 x i32> @llvm.x86.avx512.vpdpwssds.128
// (< 4 x i32>, < 4 x i32>, < 4 x i32>)
// < 8 x i32> @llvm.x86.avx512.vpdpwssds.256
// (< 8 x i32>, < 8 x i32>, < 8 x i32>)
// <16 x i32> @llvm.x86.avx512.vpdpwssds.512
// (<16 x i32>, <16 x i32>, <16 x i32>)
//
// These intrinsics are auto-upgraded into non-masked forms:
// <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128
// (<4 x i32>, <4 x i32>, <4 x i32>, i8)
// <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128
// (<4 x i32>, <4 x i32>, <4 x i32>, i8)
// <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256
// (<8 x i32>, <8 x i32>, <8 x i32>, i8)
// <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256
// (<8 x i32>, <8 x i32>, <8 x i32>, i8)
// <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512
// (<16 x i32>, <16 x i32>, <16 x i32>, i16)
// <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512
// (<16 x i32>, <16 x i32>, <16 x i32>, i16)
//
// <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128
// (<4 x i32>, <4 x i32>, <4 x i32>, i8)
// <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128
// (<4 x i32>, <4 x i32>, <4 x i32>, i8)
// <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256
// (<8 x i32>, <8 x i32>, <8 x i32>, i8)
// <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256
// (<8 x i32>, <8 x i32>, <8 x i32>, i8)
// <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512
// (<16 x i32>, <16 x i32>, <16 x i32>, i16)
// <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512
// (<16 x i32>, <16 x i32>, <16 x i32>, i16)
case Intrinsic::x86_avx512_vpdpwssd_128:
case Intrinsic::x86_avx512_vpdpwssd_256:
case Intrinsic::x86_avx512_vpdpwssd_512:
case Intrinsic::x86_avx512_vpdpwssds_128:
case Intrinsic::x86_avx512_vpdpwssds_256:
case Intrinsic::x86_avx512_vpdpwssds_512:
handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
break;
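As with the byte forms above, a minimal C++ sketch (not part of the patch; saturation in the 's' variants omitted) of the word-sized arithmetic, here ReductionFactor = 2 with 16-bit elements, e.g. llvm.x86.avx512.vpdpwssd.128:

#include <array>
#include <cstdint>

// Reference model of <4 x i32> vpdpwssd.128(<4 x i32> %s, %a, %b), with %a and
// %b viewed as 8 signed 16-bit words: each output dword is the accumulator
// dword plus the sum of 2 adjacent word products.
std::array<int32_t, 4> Vpdpwssd128Reference(std::array<int32_t, 4> S,
                                            std::array<int16_t, 8> A,
                                            std::array<int16_t, 8> B) {
  std::array<int32_t, 4> Out{};
  for (int I = 0; I < 4; ++I) {
    int32_t Sum = int32_t(A[2 * I]) * int32_t(B[2 * I]) +
                  int32_t(A[2 * I + 1]) * int32_t(B[2 * I + 1]);
    Out[I] = S[I] + Sum; // accumulate into %s
  }
  return Out;
}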

// TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single
// Precision
// <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128
// (<4 x float>, <8 x bfloat>, <8 x bfloat>)
// <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256
// (<8 x float>, <16 x bfloat>, <16 x bfloat>)
// <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512
// (<16 x float>, <32 x bfloat>, <32 x bfloat>)
// handleVectorPmaddIntrinsic() currently only handles integer types.

case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse2_cmp_sd:
case Intrinsic::x86_sse_comieq_ss:
@@ -7,19 +7,7 @@
; - llvm.x86.avx10.vdpphps.512
; - llvm.x86.avx10.vmpsadbw.512
;
; Handled heuristically:
; - llvm.x86.avx10.vpdpbssd.512
; - llvm.x86.avx10.vpdpbssds.512
; - llvm.x86.avx10.vpdpbsud.512
; - llvm.x86.avx10.vpdpbsuds.512
; - llvm.x86.avx10.vpdpbuud.512
; - llvm.x86.avx10.vpdpbuuds.512
; - llvm.x86.avx10.vpdpwsud.512
; - llvm.x86.avx10.vpdpwsuds.512
; - llvm.x86.avx10.vpdpwusd.512
; - llvm.x86.avx10.vpdpwusds.512
; - llvm.x86.avx10.vpdpwuud.512
; - llvm.x86.avx10.vpdpwuuds.512
; Handled heuristically: (none)

target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -140,8 +128,8 @@ define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
; CHECK-LABEL: define <16 x i32> @test_mm512_dpbssd_epi32(
; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
@@ -154,8 +142,26 @@ define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[__A]] to <64 x i8>
; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[__B]] to <64 x i8>
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8>
; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <64 x i8> [[TMP11]], zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <64 x i8> [[TMP12]], zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <64 x i8> [[TMP9]], zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <64 x i8> [[TMP10]], zeroinitializer
; CHECK-NEXT: [[TMP17:%.*]] = and <64 x i1> [[TMP13]], [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = and <64 x i1> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP19:%.*]] = and <64 x i1> [[TMP13]], [[TMP16]]
; CHECK-NEXT: [[TMP20:%.*]] = or <64 x i1> [[TMP17]], [[TMP18]]
; CHECK-NEXT: [[TMP21:%.*]] = or <64 x i1> [[TMP20]], [[TMP19]]
; CHECK-NEXT: [[TMP22:%.*]] = sext <64 x i1> [[TMP21]] to <64 x i8>
; CHECK-NEXT: [[TMP23:%.*]] = bitcast <64 x i8> [[TMP22]] to <32 x i16>
; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <32 x i16> [[TMP23]], zeroinitializer
; CHECK-NEXT: [[TMP25:%.*]] = sext <32 x i1> [[TMP24]] to <32 x i16>
; CHECK-NEXT: [[TMP26:%.*]] = bitcast <32 x i16> [[TMP25]] to i512
; CHECK-NEXT: [[TMP27:%.*]] = bitcast i512 [[TMP26]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP27]], [[TMP4]]
; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[RES]]
@@ -168,13 +174,31 @@ define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpbssds_epi32(
; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32> [[__A]] to <64 x i8>
; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i32> [[__B]] to <64 x i8>
; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8>
; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <64 x i8> [[TMP26]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP27]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer
; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <64 x i8> [[TMP25]], zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP28]], [[TMP10]]
; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP11]], [[TMP10]]
; CHECK-NEXT: [[TMP15:%.*]] = and <64 x i1> [[TMP28]], [[TMP12]]
; CHECK-NEXT: [[TMP16:%.*]] = or <64 x i1> [[TMP13]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i1> [[TMP16]], [[TMP15]]
; CHECK-NEXT: [[TMP18:%.*]] = sext <64 x i1> [[TMP17]] to <64 x i8>
; CHECK-NEXT: [[TMP19:%.*]] = bitcast <64 x i8> [[TMP18]] to <32 x i16>
; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[TMP19]], zeroinitializer
; CHECK-NEXT: [[TMP21:%.*]] = sext <32 x i1> [[TMP20]] to <32 x i16>
; CHECK-NEXT: [[TMP22:%.*]] = bitcast <32 x i16> [[TMP21]] to i512
; CHECK-NEXT: [[TMP23:%.*]] = bitcast i512 [[TMP22]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP23]], [[TMP1]]
; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
@@ -196,13 +220,31 @@ define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %_
define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpbssd_epi32(
; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i32> [[__A]] to <64 x i8>
; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i32> [[__B]] to <64 x i8>
; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8>
; CHECK-NEXT: [[TMP28:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <64 x i8> [[TMP27]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP28]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP25]], zeroinitializer
; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <64 x i8> [[TMP26]], zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP29]], [[TMP10]]
; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP11]], [[TMP10]]
; CHECK-NEXT: [[TMP15:%.*]] = and <64 x i1> [[TMP29]], [[TMP12]]
; CHECK-NEXT: [[TMP16:%.*]] = or <64 x i1> [[TMP13]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i1> [[TMP16]], [[TMP15]]
; CHECK-NEXT: [[TMP18:%.*]] = sext <64 x i1> [[TMP17]] to <64 x i8>
; CHECK-NEXT: [[TMP19:%.*]] = bitcast <64 x i8> [[TMP18]] to <32 x i16>
; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[TMP19]], zeroinitializer
; CHECK-NEXT: [[TMP21:%.*]] = sext <32 x i1> [[TMP20]] to <32 x i16>
; CHECK-NEXT: [[TMP22:%.*]] = bitcast <32 x i16> [[TMP21]] to i512
; CHECK-NEXT: [[TMP23:%.*]] = bitcast i512 [[TMP22]] to <16 x i32>
; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP23]], [[TMP24]]
; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>