Commit 54d6ae2

[msan] Handle AVX Vector Neural Network Instructions (VNNI)
This extends the pmadd handler (recently improved in #153353) to three-operand intrinsics (multiply-add-accumulate), and applies it to the AVX Vector Neural Network Instructions. Updates the tests from #153135.
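In shadow terms, the change makes each output lane's shadow the OR of (a) the shadow of the lane-wise multiply-add over the two multiplicand vectors and (b) the accumulator's shadow for that lane. The following is a minimal scalar model of the per-lane rule, distilled from the handler and the CHECK lines further down; it is a sketch, and names such as ShadowedByte are hypothetical, not LLVM API:

#include <array>
#include <cstdint>

// One input byte plus its MSan shadow; shadow == true means uninitialized.
// For a poisoned byte, 'value' holds whatever concrete bits happen to be in
// memory at run time.
struct ShadowedByte {
  uint8_t value;
  bool shadow;
};

// A product is clean when both operands are clean, or when one operand is an
// initialized zero (which forces the product to 0). The three terms mirror
// the instrumented IR: (Sa & Sb) | (Va != 0 & Sb) | (Sa & Vb != 0).
bool productPoisoned(ShadowedByte a, ShadowedByte b) {
  bool sa = a.shadow, sb = b.shadow;
  bool va = a.value != 0, vb = b.value != 0;
  return (sa && sb) || (va && sb) || (sa && vb);
}

// One i32 lane of a byte-wise multiply-add-accumulate (ReductionFactor 4):
// poisoned if any of its four products is poisoned (Steps 1-2 below), or if
// the accumulator lane itself is poisoned (Step 3 below).
bool lanePoisoned(const std::array<ShadowedByte, 4> &a,
                  const std::array<ShadowedByte, 4> &b, bool accShadow) {
  bool poisoned = accShadow;
  for (int j = 0; j < 4; ++j)
    poisoned = poisoned || productPoisoned(a[j], b[j]);
  return poisoned;
}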
1 parent 627f801 commit 54d6ae2

File tree: 9 files changed (+2066 additions, -357 deletions)

llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp

Lines changed: 171 additions & 12 deletions
@@ -3846,15 +3846,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
-  // Instrument multiply-add intrinsics.
+  // Instrument multiply-add(-accumulate)? intrinsics.
   //
   // e.g., Two operands:
   //           <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
   //
   //       Two operands which require an EltSizeInBits override:
   //           <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
   //
-  //       Three operands are not implemented yet:
+  //       Three operands:
   //           <4 x i32> @llvm.x86.avx512.vpdpbusd.128
   //                         (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b)
   //           (the result of multiply-add'ing %a and %b is accumulated with %s)
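For concreteness, one i32 lane of the non-saturating byte variant can be modeled by the scalar sketch below; this is an assumption-level reference distilled from the published instruction semantics, and vpdpbusd_lane is a hypothetical name, not part of the patch. The saturating variants clamp the sum instead of wrapping.

#include <cstdint>

// Reference for one output lane of vpdpbusd: %s plus the sum of four
// products of unsigned bytes from %a with signed bytes from %b.
int32_t vpdpbusd_lane(int32_t s, uint32_t a, uint32_t b) {
  int64_t acc = s; // accumulate wide to sidestep signed-overflow UB
  for (int j = 0; j < 4; ++j) {
    uint8_t ua = static_cast<uint8_t>(a >> (8 * j)); // unsigned byte of %a
    int8_t sb = static_cast<int8_t>(b >> (8 * j));   // signed byte of %b
    acc += static_cast<int32_t>(ua) * sb;            // widen and accumulate
  }
  return static_cast<int32_t>(static_cast<uint32_t>(acc)); // wrap modulo 2^32
}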
@@ -3866,22 +3866,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
         cast<FixedVectorType>(I.getType());
     assert(isa<FixedVectorType>(ReturnType));
 
-    assert(I.arg_size() == 2);
-
     // Vectors A and B, and shadows
-    Value *Va = I.getOperand(0);
-    Value *Vb = I.getOperand(1);
+    Value *Va = nullptr;
+    Value *Vb = nullptr;
+    Value *Sa = nullptr;
+    Value *Sb = nullptr;
 
-    Value *Sa = getShadow(&I, 0);
-    Value *Sb = getShadow(&I, 1);
+    if (I.arg_size() == 2) {
+      Va = I.getOperand(0);
+      Vb = I.getOperand(1);
+
+      Sa = getShadow(&I, 0);
+      Sb = getShadow(&I, 1);
+    } else if (I.arg_size() == 3) {
+      // Operand 0 is the accumulator. We will deal with that below.
+      Va = I.getOperand(1);
+      Vb = I.getOperand(2);
+
+      Sa = getShadow(&I, 1);
+      Sb = getShadow(&I, 2);
+    } else {
+      assert(I.arg_size() == 2 || I.arg_size() == 3);
+    }
 
-    FixedVectorType *ParamType =
-        cast<FixedVectorType>(I.getArgOperand(0)->getType());
-    assert(ParamType == I.getArgOperand(1)->getType());
+    FixedVectorType *ParamType = cast<FixedVectorType>(Va->getType());
+    assert(ParamType == Vb->getType());
 
     assert(ParamType->getPrimitiveSizeInBits() ==
            ReturnType->getPrimitiveSizeInBits());
 
+    if (I.arg_size() == 3) {
+      assert(ParamType == ReturnType);
+      assert(ParamType == I.getArgOperand(0)->getType());
+    }
+
     FixedVectorType *ImplicitReturnType = ReturnType;
     // Step 1: instrument multiplication of corresponding vector elements
     if (EltSizeInBits) {
@@ -3944,10 +3962,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
                         Constant::getNullValue(Horizontal->getType())),
         ImplicitReturnType);
 
-    // For MMX, cast it back to the required fake return type (<1 x i64>).
+    // Cast it back to the required fake return type (<1 x i64>).
     if (EltSizeInBits)
      OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
 
+    // Step 3 (if applicable): instrument accumulator
+    if (I.arg_size() == 3)
+      OutShadow = IRB.CreateOr(OutShadow, getShadow(&I, 0));
+
     setShadow(&I, OutShadow);
     setOriginForNaryOp(I);
   }
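Step 3 follows MSan's usual OR-approximation for integer addition: any uninitialized bit in either addend may taint the sum, so the accumulator's shadow is simply merged into the multiply-add shadow. A one-line sketch of that approximation (hypothetical helper, not part of the patch):

#include <cstdint>

// Shadow of (acc + dot(a, b)) per lane: the union of the dot-product shadow
// from Steps 1-2 and the accumulator's own shadow, matching the CreateOr above.
uint32_t accumulateLaneShadow(uint32_t dotShadow, uint32_t accShadow) {
  return dotShadow | accShadow;
}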
@@ -5507,6 +5529,143 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
       break;
 
+    // AVX Vector Neural Network Instructions: bytes
+    //
+    // Multiply and Add Packed Signed and Unsigned Bytes
+    //   < 4 x i32> @llvm.x86.avx512.vpdpbusd.128
+    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //   < 8 x i32> @llvm.x86.avx512.vpdpbusd.256
+    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //   <16 x i32> @llvm.x86.avx512.vpdpbusd.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //
+    // Multiply and Add Unsigned and Signed Bytes With Saturation
+    //   < 4 x i32> @llvm.x86.avx512.vpdpbusds.128
+    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //   < 8 x i32> @llvm.x86.avx512.vpdpbusds.256
+    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //   <16 x i32> @llvm.x86.avx512.vpdpbusds.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //
+    //   < 4 x i32> @llvm.x86.avx2.vpdpbssd.128
+    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //   < 8 x i32> @llvm.x86.avx2.vpdpbssd.256
+    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //
+    //   < 4 x i32> @llvm.x86.avx2.vpdpbssds.128
+    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //   < 8 x i32> @llvm.x86.avx2.vpdpbssds.256
+    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //
+    //   <16 x i32> @llvm.x86.avx10.vpdpbssd.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //   <16 x i32> @llvm.x86.avx10.vpdpbssds.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //
+    // These intrinsics are auto-upgraded into non-masked forms:
+    //   <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128
+    //                 (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //   <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128
+    //                 (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //   <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256
+    //                 (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //   <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256
+    //                 (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //   <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //   <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //
+    //   <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128
+    //                 (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //   <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128
+    //                 (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //   <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256
+    //                 (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //   <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256
+    //                 (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //   <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //   <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    case Intrinsic::x86_avx512_vpdpbusd_128:
+    case Intrinsic::x86_avx512_vpdpbusd_256:
+    case Intrinsic::x86_avx512_vpdpbusd_512:
+    case Intrinsic::x86_avx512_vpdpbusds_128:
+    case Intrinsic::x86_avx512_vpdpbusds_256:
+    case Intrinsic::x86_avx512_vpdpbusds_512:
+    case Intrinsic::x86_avx2_vpdpbssd_128:
+    case Intrinsic::x86_avx2_vpdpbssd_256:
+    case Intrinsic::x86_avx2_vpdpbssds_128:
+    case Intrinsic::x86_avx2_vpdpbssds_256:
+    case Intrinsic::x86_avx10_vpdpbssd_512:
+    case Intrinsic::x86_avx10_vpdpbssds_512:
+      handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8);
+      break;
+
+    // AVX Vector Neural Network Instructions: words
+    //
+    // Multiply and Add Signed Word Integers
+    //   < 4 x i32> @llvm.x86.avx512.vpdpwssd.128
+    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //   < 8 x i32> @llvm.x86.avx512.vpdpwssd.256
+    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //   <16 x i32> @llvm.x86.avx512.vpdpwssd.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //
+    // Multiply and Add Signed Word Integers With Saturation
+    //   < 4 x i32> @llvm.x86.avx512.vpdpwssds.128
+    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //   < 8 x i32> @llvm.x86.avx512.vpdpwssds.256
+    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //   <16 x i32> @llvm.x86.avx512.vpdpwssds.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //
+    // These intrinsics are auto-upgraded into non-masked forms:
+    //   <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128
+    //                 (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //   <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128
+    //                 (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //   <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256
+    //                 (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //   <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256
+    //                 (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //   <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //   <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //
+    //   <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128
+    //                 (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //   <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128
+    //                 (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //   <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256
+    //                 (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //   <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256
+    //                 (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //   <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //   <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512
+    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    case Intrinsic::x86_avx512_vpdpwssd_128:
+    case Intrinsic::x86_avx512_vpdpwssd_256:
+    case Intrinsic::x86_avx512_vpdpwssd_512:
+    case Intrinsic::x86_avx512_vpdpwssds_128:
+    case Intrinsic::x86_avx512_vpdpwssds_256:
+    case Intrinsic::x86_avx512_vpdpwssds_512:
+      handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
+      break;
+
+    // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single
+    //       Precision
+    //   <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128
+    //                   (<4 x float>, <8 x bfloat>, <8 x bfloat>)
+    //   <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256
+    //                   (<8 x float>, <16 x bfloat>, <16 x bfloat>)
+    //   <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512
+    //                    (<16 x float>, <32 x bfloat>, <32 x bfloat>)
+    //   handleVectorPmaddIntrinsic() currently only handles integer types.
+
     case Intrinsic::x86_sse_cmp_ss:
     case Intrinsic::x86_sse2_cmp_sd:
     case Intrinsic::x86_sse_comieq_ss:
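The word-sized cases reuse the same handler with ReductionFactor 2 and EltSize 16: each i32 lane accumulates two signed 16-bit by 16-bit products. A plausible scalar reference for one lane of the non-saturating vpdpwssd (a sketch from the published semantics; vpdpwssd_lane is a hypothetical name, not part of the patch):

#include <cstdint>

// Reference for one output lane of vpdpwssd: %s plus two signed
// 16x16 -> 32-bit products (ReductionFactor == 2, EltSize == 16).
int32_t vpdpwssd_lane(int32_t s, uint32_t a, uint32_t b) {
  int64_t acc = s; // accumulate wide to sidestep signed-overflow UB
  for (int j = 0; j < 2; ++j) {
    int16_t wa = static_cast<int16_t>(a >> (16 * j)); // signed word of %a
    int16_t wb = static_cast<int16_t>(b >> (16 * j)); // signed word of %b
    acc += static_cast<int32_t>(wa) * wb;             // widen and accumulate
  }
  return static_cast<int32_t>(static_cast<uint32_t>(acc)); // wrap modulo 2^32
}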

llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll

Lines changed: 64 additions & 22 deletions
@@ -7,19 +7,7 @@
 ; - llvm.x86.avx10.vdpphps.512
 ; - llvm.x86.avx10.vmpsadbw.512
 ;
-; Handled heuristically:
-; - llvm.x86.avx10.vpdpbssd.512
-; - llvm.x86.avx10.vpdpbssds.512
-; - llvm.x86.avx10.vpdpbsud.512
-; - llvm.x86.avx10.vpdpbsuds.512
-; - llvm.x86.avx10.vpdpbuud.512
-; - llvm.x86.avx10.vpdpbuuds.512
-; - llvm.x86.avx10.vpdpwsud.512
-; - llvm.x86.avx10.vpdpwsuds.512
-; - llvm.x86.avx10.vpdpwusd.512
-; - llvm.x86.avx10.vpdpwusds.512
-; - llvm.x86.avx10.vpdpwuud.512
-; - llvm.x86.avx10.vpdpwuuds.512
+; Handled heuristically: (none)
 
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -140,8 +128,8 @@ define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
 ; CHECK-LABEL: define <16 x i32> @test_mm512_dpbssd_epi32(
 ; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
@@ -154,8 +142,26 @@ define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[__A]] to <64 x i8>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[__B]] to <64 x i8>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8>
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <64 x i8> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <64 x i8> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <64 x i8> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <64 x i8> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = and <64 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP18:%.*]] = and <64 x i1> [[TMP15]], [[TMP14]]
+; CHECK-NEXT: [[TMP19:%.*]] = and <64 x i1> [[TMP13]], [[TMP16]]
+; CHECK-NEXT: [[TMP20:%.*]] = or <64 x i1> [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP21:%.*]] = or <64 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP22:%.*]] = sext <64 x i1> [[TMP21]] to <64 x i8>
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast <64 x i8> [[TMP22]] to <32 x i16>
+; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <32 x i16> [[TMP23]], zeroinitializer
+; CHECK-NEXT: [[TMP25:%.*]] = sext <32 x i1> [[TMP24]] to <32 x i16>
+; CHECK-NEXT: [[TMP26:%.*]] = bitcast <32 x i16> [[TMP25]] to i512
+; CHECK-NEXT: [[TMP27:%.*]] = bitcast i512 [[TMP26]] to <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP27]], [[TMP4]]
 ; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
 ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <16 x i32> [[RES]]
@@ -168,13 +174,31 @@ define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
 define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpbssds_epi32(
 ; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32> [[__A]] to <64 x i8>
+; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i32> [[__B]] to <64 x i8>
+; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8>
+; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <64 x i8> [[TMP26]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP27]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <64 x i8> [[TMP25]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP28]], [[TMP10]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <64 x i1> [[TMP28]], [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <64 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = sext <64 x i1> [[TMP17]] to <64 x i8>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <64 x i8> [[TMP18]] to <32 x i16>
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[TMP19]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = sext <32 x i1> [[TMP20]] to <32 x i16>
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast <32 x i16> [[TMP21]] to i512
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast i512 [[TMP22]] to <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP23]], [[TMP1]]
 ; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
@@ -196,13 +220,31 @@ define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %_
 define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpbssd_epi32(
 ; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i32> [[__A]] to <64 x i8>
+; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i32> [[__B]] to <64 x i8>
+; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8>
+; CHECK-NEXT: [[TMP28:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
+; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <64 x i8> [[TMP27]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP28]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP25]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <64 x i8> [[TMP26]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP29]], [[TMP10]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <64 x i1> [[TMP29]], [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <64 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = sext <64 x i1> [[TMP17]] to <64 x i8>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <64 x i8> [[TMP18]] to <32 x i16>
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[TMP19]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = sext <32 x i1> [[TMP20]] to <32 x i16>
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast <32 x i16> [[TMP21]] to i512
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast i512 [[TMP22]] to <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP23]], [[TMP24]]
 ; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
