Skip to content

Commit ddd49c6

Browse files
authored
[msan] Handle AVX512/AVX10 rcp and rsqrt (#158397)
Adds a new handler, handleAVX512VectorGenericMaskedFP(), and applies it to AVX512/AVX10 rcp and rsqrt
1 parent d012642 commit ddd49c6

File tree

4 files changed

+381
-266
lines changed

4 files changed

+381
-266
lines changed

llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4911,6 +4911,69 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
49114911
setOriginForNaryOp(I);
49124912
}
49134913

4914+
// Handle llvm.x86.avx512.* instructions that take a vector of floating-point
4915+
// values and perform an operation whose shadow propagation should be handled
4916+
// as all-or-nothing [*], with masking provided by a vector and a mask
4917+
// supplied as an integer.
4918+
//
4919+
// [*] if all bits of a vector element are initialized, the output is fully
4920+
// initialized; otherwise, the output is fully uninitialized
4921+
//
4922+
// e.g., <16 x float> @llvm.x86.avx512.rsqrt14.ps.512
4923+
// (<16 x float>, <16 x float>, i16)
4924+
// A WriteThru Mask
4925+
//
4926+
// <2 x double> @llvm.x86.avx512.rcp14.pd.128
4927+
// (<2 x double>, <2 x double>, i8)
4928+
//
4929+
// Dst[i] = Mask[i] ? some_op(A[i]) : WriteThru[i]
4930+
// Dst_shadow[i] = Mask[i] ? all_or_nothing(A_shadow[i]) : WriteThru_shadow[i]
4931+
void handleAVX512VectorGenericMaskedFP(IntrinsicInst &I) {
4932+
IRBuilder<> IRB(&I);
4933+
4934+
assert(I.arg_size() == 3);
4935+
Value *A = I.getOperand(0);
4936+
Value *WriteThrough = I.getOperand(1);
4937+
Value *Mask = I.getOperand(2);
4938+
4939+
assert(isFixedFPVector(A));
4940+
assert(isFixedFPVector(WriteThrough));
4941+
4942+
[[maybe_unused]] unsigned ANumElements =
4943+
cast<FixedVectorType>(A->getType())->getNumElements();
4944+
unsigned OutputNumElements =
4945+
cast<FixedVectorType>(WriteThrough->getType())->getNumElements();
4946+
assert(ANumElements == OutputNumElements);
4947+
4948+
assert(Mask->getType()->isIntegerTy());
4949+
// Some bits of the mask might be unused, but check them all anyway
4950+
// (typically the mask is an integer constant).
4951+
insertCheckShadowOf(Mask, &I);
4952+
4953+
// The mask has 1 bit per element of A, but a minimum of 8 bits.
4954+
if (Mask->getType()->getScalarSizeInBits() == 8 && ANumElements < 8)
4955+
Mask = IRB.CreateTrunc(Mask, Type::getIntNTy(*MS.C, ANumElements));
4956+
assert(Mask->getType()->getScalarSizeInBits() == ANumElements);
4957+
4958+
assert(I.getType() == WriteThrough->getType());
4959+
4960+
Mask = IRB.CreateBitCast(
4961+
Mask, FixedVectorType::get(IRB.getInt1Ty(), OutputNumElements));
4962+
4963+
Value *AShadow = getShadow(A);
4964+
4965+
// All-or-nothing shadow
4966+
AShadow = IRB.CreateSExt(IRB.CreateICmpNE(AShadow, getCleanShadow(AShadow)),
4967+
AShadow->getType());
4968+
4969+
Value *WriteThroughShadow = getShadow(WriteThrough);
4970+
4971+
Value *Shadow = IRB.CreateSelect(Mask, AShadow, WriteThroughShadow);
4972+
setShadow(&I, Shadow);
4973+
4974+
setOriginForNaryOp(I);
4975+
}
4976+
49144977
// For sh.* compiler intrinsics:
49154978
// llvm.x86.avx512fp16.mask.{add/sub/mul/div/max/min}.sh.round
49164979
// (<8 x half>, <8 x half>, <8 x half>, i8, i32)
@@ -6091,6 +6154,108 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
60916154
break;
60926155
}
60936156

6157+
// AVX512/AVX10 Reciprocal Square Root
6158+
// <16 x float> @llvm.x86.avx512.rsqrt14.ps.512
6159+
// (<16 x float>, <16 x float>, i16)
6160+
// <8 x float> @llvm.x86.avx512.rsqrt14.ps.256
6161+
// (<8 x float>, <8 x float>, i8)
6162+
// <4 x float> @llvm.x86.avx512.rsqrt14.ps.128
6163+
// (<4 x float>, <4 x float>, i8)
6164+
//
6165+
// <8 x double> @llvm.x86.avx512.rsqrt14.pd.512
6166+
// (<8 x double>, <8 x double>, i8)
6167+
// <4 x double> @llvm.x86.avx512.rsqrt14.pd.256
6168+
// (<4 x double>, <4 x double>, i8)
6169+
// <2 x double> @llvm.x86.avx512.rsqrt14.pd.128
6170+
// (<2 x double>, <2 x double>, i8)
6171+
//
6172+
// <32 x bfloat> @llvm.x86.avx10.mask.rsqrt.bf16.512
6173+
// (<32 x bfloat>, <32 x bfloat>, i32)
6174+
// <16 x bfloat> @llvm.x86.avx10.mask.rsqrt.bf16.256
6175+
// (<16 x bfloat>, <16 x bfloat>, i16)
6176+
// <8 x bfloat> @llvm.x86.avx10.mask.rsqrt.bf16.128
6177+
// (<8 x bfloat>, <8 x bfloat>, i8)
6178+
//
6179+
// <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512
6180+
// (<32 x half>, <32 x half>, i32)
6181+
// <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256
6182+
// (<16 x half>, <16 x half>, i16)
6183+
// <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128
6184+
// (<8 x half>, <8 x half>, i8)
6185+
//
6186+
// TODO: 3-operand variants are not handled:
6187+
// <2 x double> @llvm.x86.avx512.rsqrt14.sd
6188+
// (<2 x double>, <2 x double>, <2 x double>, i8)
6189+
// <4 x float> @llvm.x86.avx512.rsqrt14.ss
6190+
// (<4 x float>, <4 x float>, <4 x float>, i8)
6191+
// <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh
6192+
// (<8 x half>, <8 x half>, <8 x half>, i8)
6193+
case Intrinsic::x86_avx512_rsqrt14_ps_512:
6194+
case Intrinsic::x86_avx512_rsqrt14_ps_256:
6195+
case Intrinsic::x86_avx512_rsqrt14_ps_128:
6196+
case Intrinsic::x86_avx512_rsqrt14_pd_512:
6197+
case Intrinsic::x86_avx512_rsqrt14_pd_256:
6198+
case Intrinsic::x86_avx512_rsqrt14_pd_128:
6199+
case Intrinsic::x86_avx10_mask_rsqrt_bf16_512:
6200+
case Intrinsic::x86_avx10_mask_rsqrt_bf16_256:
6201+
case Intrinsic::x86_avx10_mask_rsqrt_bf16_128:
6202+
case Intrinsic::x86_avx512fp16_mask_rsqrt_ph_512:
6203+
case Intrinsic::x86_avx512fp16_mask_rsqrt_ph_256:
6204+
case Intrinsic::x86_avx512fp16_mask_rsqrt_ph_128:
6205+
handleAVX512VectorGenericMaskedFP(I);
6206+
break;
6207+
6208+
// AVX512/AVX10 Reciprocal
6209+
// <16 x float> @llvm.x86.avx512.rcp14.ps.512
6210+
// (<16 x float>, <16 x float>, i16)
6211+
// <8 x float> @llvm.x86.avx512.rcp14.ps.256
6212+
// (<8 x float>, <8 x float>, i8)
6213+
// <4 x float> @llvm.x86.avx512.rcp14.ps.128
6214+
// (<4 x float>, <4 x float>, i8)
6215+
//
6216+
// <8 x double> @llvm.x86.avx512.rcp14.pd.512
6217+
// (<8 x double>, <8 x double>, i8)
6218+
// <4 x double> @llvm.x86.avx512.rcp14.pd.256
6219+
// (<4 x double>, <4 x double>, i8)
6220+
// <2 x double> @llvm.x86.avx512.rcp14.pd.128
6221+
// (<2 x double>, <2 x double>, i8)
6222+
//
6223+
// <32 x bfloat> @llvm.x86.avx10.mask.rcp.bf16.512
6224+
// (<32 x bfloat>, <32 x bfloat>, i32)
6225+
// <16 x bfloat> @llvm.x86.avx10.mask.rcp.bf16.256
6226+
// (<16 x bfloat>, <16 x bfloat>, i16)
6227+
// <8 x bfloat> @llvm.x86.avx10.mask.rcp.bf16.128
6228+
// (<8 x bfloat>, <8 x bfloat>, i8)
6229+
//
6230+
// <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512
6231+
// (<32 x half>, <32 x half>, i32)
6232+
// <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256
6233+
// (<16 x half>, <16 x half>, i16)
6234+
// <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128
6235+
// (<8 x half>, <8 x half>, i8)
6236+
//
6237+
// TODO: 3-operand variants are not handled:
6238+
// <2 x double> @llvm.x86.avx512.rcp14.sd
6239+
// (<2 x double>, <2 x double>, <2 x double>, i8)
6240+
// <4 x float> @llvm.x86.avx512.rcp14.ss
6241+
// (<4 x float>, <4 x float>, <4 x float>, i8)
6242+
// <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh
6243+
// (<8 x half>, <8 x half>, <8 x half>, i8)
6244+
case Intrinsic::x86_avx512_rcp14_ps_512:
6245+
case Intrinsic::x86_avx512_rcp14_ps_256:
6246+
case Intrinsic::x86_avx512_rcp14_ps_128:
6247+
case Intrinsic::x86_avx512_rcp14_pd_512:
6248+
case Intrinsic::x86_avx512_rcp14_pd_256:
6249+
case Intrinsic::x86_avx512_rcp14_pd_128:
6250+
case Intrinsic::x86_avx10_mask_rcp_bf16_512:
6251+
case Intrinsic::x86_avx10_mask_rcp_bf16_256:
6252+
case Intrinsic::x86_avx10_mask_rcp_bf16_128:
6253+
case Intrinsic::x86_avx512fp16_mask_rcp_ph_512:
6254+
case Intrinsic::x86_avx512fp16_mask_rcp_ph_256:
6255+
case Intrinsic::x86_avx512fp16_mask_rcp_ph_128:
6256+
handleAVX512VectorGenericMaskedFP(I);
6257+
break;
6258+
60946259
// AVX512 FP16 Arithmetic
60956260
case Intrinsic::x86_avx512fp16_mask_add_sh_round:
60966261
case Intrinsic::x86_avx512fp16_mask_sub_sh_round:

llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll

Lines changed: 12 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,6 @@
2828
; - llvm.x86.avx512.mul.pd.512, llvm.x86.avx512.mul.ps.512
2929
; - llvm.x86.avx512.permvar.df.512, llvm.x86.avx512.permvar.sf.512
3030
; - llvm.x86.avx512.pternlog.d.512, llvm.x86.avx512.pternlog.q.512
31-
; - llvm.x86.avx512.rcp14.pd.512, llvm.x86.avx512.rcp14.ps.512
32-
; - llvm.x86.avx512.rsqrt14.ps.512
3331
; - llvm.x86.avx512.sitofp.round.v16f32.v16i32
3432
; - llvm.x86.avx512.sqrt.pd.512, llvm.x86.avx512.sqrt.ps.512
3533
; - llvm.x86.avx512.sub.ps.512
@@ -682,15 +680,11 @@ define <16 x float> @test_rcp_ps_512(<16 x float> %a0) #0 {
682680
; CHECK-LABEL: @test_rcp_ps_512(
683681
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
684682
; CHECK-NEXT: call void @llvm.donothing()
685-
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
686-
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
687-
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
688-
; CHECK: 3:
689-
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
690-
; CHECK-NEXT: unreachable
691-
; CHECK: 4:
683+
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer
684+
; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i1> [[TMP2]] to <16 x i32>
685+
; CHECK-NEXT: [[TMP4:%.*]] = select <16 x i1> splat (i1 true), <16 x i32> [[TMP3]], <16 x i32> zeroinitializer
692686
; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1)
693-
; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
687+
; CHECK-NEXT: store <16 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
694688
; CHECK-NEXT: ret <16 x float> [[RES]]
695689
;
696690
%res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
@@ -702,15 +696,11 @@ define <8 x double> @test_rcp_pd_512(<8 x double> %a0) #0 {
702696
; CHECK-LABEL: @test_rcp_pd_512(
703697
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
704698
; CHECK-NEXT: call void @llvm.donothing()
705-
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
706-
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
707-
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
708-
; CHECK: 3:
709-
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
710-
; CHECK-NEXT: unreachable
711-
; CHECK: 4:
699+
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <8 x i64> [[TMP1]], zeroinitializer
700+
; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i64>
701+
; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> splat (i1 true), <8 x i64> [[TMP3]], <8 x i64> zeroinitializer
712702
; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1)
713-
; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
703+
; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
714704
; CHECK-NEXT: ret <8 x double> [[RES]]
715705
;
716706
%res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1]
@@ -1021,15 +1011,11 @@ define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) #0 {
10211011
; CHECK-LABEL: @test_rsqrt_ps_512(
10221012
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
10231013
; CHECK-NEXT: call void @llvm.donothing()
1024-
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
1025-
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
1026-
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
1027-
; CHECK: 3:
1028-
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
1029-
; CHECK-NEXT: unreachable
1030-
; CHECK: 4:
1014+
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer
1015+
; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i1> [[TMP2]] to <16 x i32>
1016+
; CHECK-NEXT: [[TMP4:%.*]] = select <16 x i1> splat (i1 true), <16 x i32> [[TMP3]], <16 x i32> zeroinitializer
10311017
; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1)
1032-
; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
1018+
; CHECK-NEXT: store <16 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
10331019
; CHECK-NEXT: ret <16 x float> [[RES]]
10341020
;
10351021
%res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]

llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
; - llvm.x86.avx512fp16.mask.reduce.sh
2020
; - llvm.x86.avx512fp16.mask.rndscale.ph.512
2121
; - llvm.x86.avx512fp16.mask.rndscale.sh
22-
; - llvm.x86.avx512fp16.mask.rsqrt.ph.512
2322
; - llvm.x86.avx512fp16.mask.rsqrt.sh
2423
; - llvm.x86.avx512fp16.mask.scalef.ph.512
2524
; - llvm.x86.avx512fp16.mask.scalef.sh
@@ -442,15 +441,11 @@ define <32 x half> @test_rsqrt_ph_512(<32 x half> %a0) #0 {
442441
; CHECK-SAME: <32 x half> [[A0:%.*]]) #[[ATTR1]] {
443442
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
444443
; CHECK-NEXT: call void @llvm.donothing()
445-
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
446-
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
447-
; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
448-
; CHECK: [[BB3]]:
449-
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
450-
; CHECK-NEXT: unreachable
451-
; CHECK: [[BB4]]:
444+
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
445+
; CHECK-NEXT: [[TMP3:%.*]] = sext <32 x i1> [[TMP2]] to <32 x i16>
446+
; CHECK-NEXT: [[TMP4:%.*]] = select <32 x i1> splat (i1 true), <32 x i16> [[TMP3]], <32 x i16> zeroinitializer
452447
; CHECK-NEXT: [[RES:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> [[A0]], <32 x half> zeroinitializer, i32 -1)
453-
; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
448+
; CHECK-NEXT: store <32 x i16> [[TMP4]], ptr @__msan_retval_tls, align 8
454449
; CHECK-NEXT: ret <32 x half> [[RES]]
455450
;
456451
%res = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 -1)
@@ -681,24 +676,22 @@ declare <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half>, <32 x half
681676
define <32 x half> @test_rcp_ph_512(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
682677
; CHECK-LABEL: define <32 x half> @test_rcp_ph_512(
683678
; CHECK-SAME: <32 x half> [[A0:%.*]], <32 x half> [[A1:%.*]], i32 [[MASK:%.*]]) #[[ATTR1]] {
679+
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
684680
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
685681
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
686-
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
687682
; CHECK-NEXT: call void @llvm.donothing()
688-
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512
689-
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
690-
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP2]] to i512
691-
; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
692-
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
683+
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[MASK]] to <32 x i1>
684+
; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
685+
; CHECK-NEXT: [[TMP6:%.*]] = sext <32 x i1> [[TMP5]] to <32 x i16>
686+
; CHECK-NEXT: [[TMP7:%.*]] = select <32 x i1> [[TMP4]], <32 x i16> [[TMP6]], <32 x i16> [[TMP2]]
693687
; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i32 [[TMP3]], 0
694-
; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
695-
; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
696-
; CHECK: [[BB6]]:
688+
; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
689+
; CHECK: [[BB8]]:
697690
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
698691
; CHECK-NEXT: unreachable
699-
; CHECK: [[BB7]]:
692+
; CHECK: [[BB9]]:
700693
; CHECK-NEXT: [[RES:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> [[A0]], <32 x half> [[A1]], i32 [[MASK]])
701-
; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
694+
; CHECK-NEXT: store <32 x i16> [[TMP7]], ptr @__msan_retval_tls, align 8
702695
; CHECK-NEXT: ret <32 x half> [[RES]]
703696
;
704697
%res = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> %a1, i32 %mask)
@@ -3260,3 +3253,6 @@ define <32 x half> @test_mm512_castph256_ph512_freeze(<16 x half> %a0) nounwind
32603253
}
32613254

32623255
attributes #0 = { sanitize_memory }
3256+
;.
3257+
; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
3258+
;.

0 commit comments

Comments
 (0)