Skip to content

Conversation

@thurstond
Copy link
Contributor

This adds a handler, visitGenericScalarHalfwordInst, which works for mask.{add/sub/mul/div/max/min}.sh.round.

Updates the tests in #136260

[msan] Add handler for llvm.x86.avx512fp16.mask.{add/sub/mul/div/max/min}.sh.round

This adds a handler, visitGenericScalarHalfwordInst, which works for
mask.{add/sub/mul/div/max/min}.sh.round.

Updates the tests in llvm#136260
@llvmbot
Copy link
Member

llvmbot commented Apr 26, 2025

@llvm/pr-subscribers-compiler-rt-sanitizer

Author: Thurston Dang (thurstond)

Changes

This adds a handler, visitGenericScalarHalfwordInst, which works for mask.{add/sub/mul/div/max/min}.sh.round.

Updates the tests in #136260


Patch is 47.87 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137441.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp (+69)
  • (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll (+237-220)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 8e31e8d2a4fbd..9f4708e14aa6a 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -4312,6 +4312,65 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
+  // For sh compiler intrinsics:
+  // llvm.x86.avx512fp16.mask.{add/sub/mul/div/max/min}.sh.round
+  //   (<8 x half>, <8 x half>, <8 x half>, i8,  i32)
+  //    A           B           WriteThru   Mask RoundingMode
+  //
+  // if (Mask[0])
+  //   DstShadow[0] = AShadow[0] | BShadow[0]
+  // else
+  //   DstShadow[0] = WriteThruShadow[0]
+  //
+  // DstShadow[1..7] = AShadow[1..7]
+  void visitGenericScalarHalfwordInst(IntrinsicInst &I) {
+    IRBuilder<> IRB(&I);
+
+    assert(I.arg_size() == 5);
+    Value *A = I.getOperand(0);
+    Value *B = I.getOperand(1);
+    Value *WriteThrough = I.getOperand(2);
+    Value *Mask = I.getOperand(3);
+    Value *RoundingMode = I.getOperand(4);
+
+    // Technically, we could probably just check whether the LSB is initialized
+    insertShadowCheck(Mask, &I);
+    insertShadowCheck(RoundingMode, &I);
+
+    assert(isa<FixedVectorType>(A->getType()));
+    unsigned NumElements =
+        cast<FixedVectorType>(A->getType())->getNumElements();
+    assert(NumElements == 8);
+    assert(A->getType() == B->getType());
+    assert(B->getType() == WriteThrough->getType());
+    assert(Mask->getType()->getPrimitiveSizeInBits() == NumElements);
+    assert(RoundingMode->getType()->isIntegerTy());
+
+    Mask = IRB.CreateBitCast(
+        Mask, FixedVectorType::get(IRB.getInt1Ty(), NumElements));
+
+    Value *AShadow = getShadow(A);
+    Value *BShadow = getShadow(B);
+    Value *ABLowerShadow =
+        IRB.CreateOr(IRB.CreateExtractElement(
+                         AShadow, ConstantInt::get(IRB.getInt32Ty(), 0)),
+                     IRB.CreateExtractElement(
+                         BShadow, ConstantInt::get(IRB.getInt32Ty(), 0)));
+    Value *WriteThroughShadow = getShadow(WriteThrough);
+    Value *WriteThroughLowerShadow = IRB.CreateExtractElement(
+        WriteThroughShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
+
+    Value *DstLowerShadow = IRB.CreateSelect(
+        IRB.CreateExtractElement(Mask, ConstantInt::get(IRB.getInt32Ty(), 0)),
+        ABLowerShadow, WriteThroughLowerShadow);
+    Value *DstShadow = IRB.CreateInsertElement(
+        AShadow, DstLowerShadow, ConstantInt::get(IRB.getInt32Ty(), 0),
+        "_msprop");
+
+    setShadow(&I, DstShadow);
+    setOriginForNaryOp(I);
+  }
+
   // Handle Arm NEON vector load intrinsics (vld*).
   //
   // The WithLane instructions (ld[234]lane) are similar to:
@@ -5041,6 +5100,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       break;
     }
 
+    case Intrinsic::x86_avx512fp16_mask_add_sh_round:
+    case Intrinsic::x86_avx512fp16_mask_sub_sh_round:
+    case Intrinsic::x86_avx512fp16_mask_mul_sh_round:
+    case Intrinsic::x86_avx512fp16_mask_div_sh_round:
+    case Intrinsic::x86_avx512fp16_mask_max_sh_round:
+    case Intrinsic::x86_avx512fp16_mask_min_sh_round: {
+      visitGenericScalarHalfwordInst(I);
+      break;
+    }
+
     case Intrinsic::fshl:
     case Intrinsic::fshr:
       handleFunnelShift(I);
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll
index 61a32e5e2042e..b11b21da492d2 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll
@@ -13,9 +13,6 @@
 ; - llvm.x86.avx512fp16.mask.getexp.sh
 ; - llvm.x86.avx512fp16.mask.getmant.ph.512
 ; - llvm.x86.avx512fp16.mask.getmant.sh
-; - llvm.x86.avx512fp16.mask.max.sh.round
-; - llvm.x86.avx512fp16.mask.min.sh.round
-; - llvm.x86.avx512fp16.mask.mul.sh.round
 ; - llvm.x86.avx512fp16.mask.rcp.ph.512
 ; - llvm.x86.avx512fp16.mask.rcp.sh
 ; - llvm.x86.avx512fp16.mask.reduce.ph.512
@@ -27,7 +24,6 @@
 ; - llvm.x86.avx512fp16.mask.scalef.ph.512
 ; - llvm.x86.avx512fp16.mask.scalef.sh
 ; - llvm.x86.avx512fp16.mask.sqrt.sh
-; - llvm.x86.avx512fp16.mask.sub.sh.round
 ; - llvm.x86.avx512fp16.mask.vcvtph2dq.512
 ; - llvm.x86.avx512fp16.mask.vcvtph2qq.512
 ; - llvm.x86.avx512fp16.mask.vcvtph2udq.512
@@ -1393,8 +1389,8 @@ define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
@@ -1409,54 +1405,57 @@ define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP10]], align 2
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[_MSLD]], i32 0
 ; CHECK-NEXT:    [[VAL:%.*]] = insertelement <8 x half> poison, half [[VAL_HALF]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
-; CHECK:       [[BB13]]:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT:    unreachable
-; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = or i16 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 true, i16 [[TMP13]], i16 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0
 ; CHECK-NEXT:    [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4)
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP15]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP16]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0
 ; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
-; CHECK:       [[BB17]]:
+; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB22:.*]], label %[[BB23:.*]], !prof [[PROF1]]
+; CHECK:       [[BB22]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB18]]:
+; CHECK:       [[BB23]]:
 ; CHECK-NEXT:    [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4)
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP19]], 0
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0
+; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0
 ; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSCMP8]], [[_MSCMP9]]
-; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB20:.*]], label %[[BB21:.*]], !prof [[PROF1]]
-; CHECK:       [[BB20]]:
+; CHECK-NEXT:    br i1 [[_MSCMP9]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]]
+; CHECK:       [[BB30]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB21]]:
+; CHECK:       [[BB31]]:
 ; CHECK-NEXT:    [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4)
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i16> [[_MSPROP]] to i128
-; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP22]], 0
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
-; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP23]], 0
-; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0
+; CHECK-NEXT:    [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0
 ; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]]
-; CHECK:       [[BB24]]:
+; CHECK-NEXT:    br i1 [[_MSCMP14]], label %[[BB39:.*]], label %[[BB40:.*]], !prof [[PROF1]]
+; CHECK:       [[BB39]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB25]]:
+; CHECK:       [[BB40]]:
 ; CHECK-NEXT:    [[RES3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[RES2]], <8 x half> [[VAL]], <8 x half> [[SRC]], i8 [[MASK]], i32 4)
-; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <8 x i16> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x half> [[RES3]]
 ;
   %val.half = load half,ptr %ptr
@@ -1476,8 +1475,8 @@ define <8 x half> @test_int_x86_avx512fp16_mask_sub_sh(<8 x half> %x1, <8 x half
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
@@ -1492,54 +1491,57 @@ define <8 x half> @test_int_x86_avx512fp16_mask_sub_sh(<8 x half> %x1, <8 x half
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP10]], align 2
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[_MSLD]], i32 0
 ; CHECK-NEXT:    [[VAL:%.*]] = insertelement <8 x half> poison, half [[VAL_HALF]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
-; CHECK:       [[BB13]]:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT:    unreachable
-; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = or i16 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 true, i16 [[TMP13]], i16 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0
 ; CHECK-NEXT:    [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4)
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP15]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP16]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0
 ; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
-; CHECK:       [[BB17]]:
+; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB22:.*]], label %[[BB23:.*]], !prof [[PROF1]]
+; CHECK:       [[BB22]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB18]]:
+; CHECK:       [[BB23]]:
 ; CHECK-NEXT:    [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4)
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP19]], 0
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0
+; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0
 ; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSCMP8]], [[_MSCMP9]]
-; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB20:.*]], label %[[BB21:.*]], !prof [[PROF1]]
-; CHECK:       [[BB20]]:
+; CHECK-NEXT:    br i1 [[_MSCMP9]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]]
+; CHECK:       [[BB30]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB21]]:
+; CHECK:       [[BB31]]:
 ; CHECK-NEXT:    [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4)
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i16> [[_MSPROP]] to i128
-; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP22]], 0
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
-; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP23]], 0
-; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0
+; CHECK-NEXT:    [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0
 ; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]]
-; CHECK:       [[BB24]]:
+; CHECK-NEXT:    br i1 [[_MSCMP14]], label %[[BB39:.*]], label %[[BB40:.*]], !prof [[PROF1]]
+; CHECK:       [[BB39]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB25]]:
+; CHECK:       [[BB40]]:
 ; CHECK-NEXT:    [[RES3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[RES2]], <8 x half> [[VAL]], <8 x half> [[SRC]], i8 [[MASK]], i32 4)
-; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <8 x i16> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x half> [[RES3]]
 ;
   %val.half = load half,ptr %ptr
@@ -1559,8 +1561,8 @@ define <8 x half> @test_int_x86_avx512fp16_mask_mul_sh(<8 x half> %x1, <8 x half
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
@@ -1575,54 +1577,57 @@ define <8 x half> @test_int_x86_avx512fp16_mask_mul_sh(<8 x half> %x1, <8 x half
 ; CHECK-NEXT:    [[_MSLD:%.*]] =...
[truncated]

@llvmbot
Copy link
Member

llvmbot commented Apr 26, 2025

@llvm/pr-subscribers-llvm-transforms

Author: Thurston Dang (thurstond)

Changes

This adds a handler, visitGenericScalarHalfwordInst, which works for mask.{add/sub/mul/div/max/min}.sh.round.

Updates the tests in #136260


Patch is 47.87 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137441.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp (+69)
  • (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll (+237-220)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 8e31e8d2a4fbd..9f4708e14aa6a 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -4312,6 +4312,65 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
+  // For sh compiler intrinsics:
+  // llvm.x86.avx512fp16.mask.{add/sub/mul/div/max/min}.sh.round
+  //   (<8 x half>, <8 x half>, <8 x half>, i8,  i32)
+  //    A           B           WriteThru   Mask RoundingMode
+  //
+  // if (Mask[0])
+  //   DstShadow[0] = AShadow[0] | BShadow[0]
+  // else
+  //   DstShadow[0] = WriteThruShadow[0]
+  //
+  // DstShadow[1..7] = AShadow[1..7]
+  void visitGenericScalarHalfwordInst(IntrinsicInst &I) {
+    IRBuilder<> IRB(&I);
+
+    assert(I.arg_size() == 5);
+    Value *A = I.getOperand(0);
+    Value *B = I.getOperand(1);
+    Value *WriteThrough = I.getOperand(2);
+    Value *Mask = I.getOperand(3);
+    Value *RoundingMode = I.getOperand(4);
+
+    // Technically, we could probably just check whether the LSB is initialized
+    insertShadowCheck(Mask, &I);
+    insertShadowCheck(RoundingMode, &I);
+
+    assert(isa<FixedVectorType>(A->getType()));
+    unsigned NumElements =
+        cast<FixedVectorType>(A->getType())->getNumElements();
+    assert(NumElements == 8);
+    assert(A->getType() == B->getType());
+    assert(B->getType() == WriteThrough->getType());
+    assert(Mask->getType()->getPrimitiveSizeInBits() == NumElements);
+    assert(RoundingMode->getType()->isIntegerTy());
+
+    Mask = IRB.CreateBitCast(
+        Mask, FixedVectorType::get(IRB.getInt1Ty(), NumElements));
+
+    Value *AShadow = getShadow(A);
+    Value *BShadow = getShadow(B);
+    Value *ABLowerShadow =
+        IRB.CreateOr(IRB.CreateExtractElement(
+                         AShadow, ConstantInt::get(IRB.getInt32Ty(), 0)),
+                     IRB.CreateExtractElement(
+                         BShadow, ConstantInt::get(IRB.getInt32Ty(), 0)));
+    Value *WriteThroughShadow = getShadow(WriteThrough);
+    Value *WriteThroughLowerShadow = IRB.CreateExtractElement(
+        WriteThroughShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
+
+    Value *DstLowerShadow = IRB.CreateSelect(
+        IRB.CreateExtractElement(Mask, ConstantInt::get(IRB.getInt32Ty(), 0)),
+        ABLowerShadow, WriteThroughLowerShadow);
+    Value *DstShadow = IRB.CreateInsertElement(
+        AShadow, DstLowerShadow, ConstantInt::get(IRB.getInt32Ty(), 0),
+        "_msprop");
+
+    setShadow(&I, DstShadow);
+    setOriginForNaryOp(I);
+  }
+
   // Handle Arm NEON vector load intrinsics (vld*).
   //
   // The WithLane instructions (ld[234]lane) are similar to:
@@ -5041,6 +5100,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       break;
     }
 
+    case Intrinsic::x86_avx512fp16_mask_add_sh_round:
+    case Intrinsic::x86_avx512fp16_mask_sub_sh_round:
+    case Intrinsic::x86_avx512fp16_mask_mul_sh_round:
+    case Intrinsic::x86_avx512fp16_mask_div_sh_round:
+    case Intrinsic::x86_avx512fp16_mask_max_sh_round:
+    case Intrinsic::x86_avx512fp16_mask_min_sh_round: {
+      visitGenericScalarHalfwordInst(I);
+      break;
+    }
+
     case Intrinsic::fshl:
     case Intrinsic::fshr:
       handleFunnelShift(I);
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll
index 61a32e5e2042e..b11b21da492d2 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll
@@ -13,9 +13,6 @@
 ; - llvm.x86.avx512fp16.mask.getexp.sh
 ; - llvm.x86.avx512fp16.mask.getmant.ph.512
 ; - llvm.x86.avx512fp16.mask.getmant.sh
-; - llvm.x86.avx512fp16.mask.max.sh.round
-; - llvm.x86.avx512fp16.mask.min.sh.round
-; - llvm.x86.avx512fp16.mask.mul.sh.round
 ; - llvm.x86.avx512fp16.mask.rcp.ph.512
 ; - llvm.x86.avx512fp16.mask.rcp.sh
 ; - llvm.x86.avx512fp16.mask.reduce.ph.512
@@ -27,7 +24,6 @@
 ; - llvm.x86.avx512fp16.mask.scalef.ph.512
 ; - llvm.x86.avx512fp16.mask.scalef.sh
 ; - llvm.x86.avx512fp16.mask.sqrt.sh
-; - llvm.x86.avx512fp16.mask.sub.sh.round
 ; - llvm.x86.avx512fp16.mask.vcvtph2dq.512
 ; - llvm.x86.avx512fp16.mask.vcvtph2qq.512
 ; - llvm.x86.avx512fp16.mask.vcvtph2udq.512
@@ -1393,8 +1389,8 @@ define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
@@ -1409,54 +1405,57 @@ define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP10]], align 2
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[_MSLD]], i32 0
 ; CHECK-NEXT:    [[VAL:%.*]] = insertelement <8 x half> poison, half [[VAL_HALF]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
-; CHECK:       [[BB13]]:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT:    unreachable
-; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = or i16 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 true, i16 [[TMP13]], i16 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0
 ; CHECK-NEXT:    [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4)
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP15]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP16]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0
 ; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
-; CHECK:       [[BB17]]:
+; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB22:.*]], label %[[BB23:.*]], !prof [[PROF1]]
+; CHECK:       [[BB22]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB18]]:
+; CHECK:       [[BB23]]:
 ; CHECK-NEXT:    [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4)
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP19]], 0
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0
+; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0
 ; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSCMP8]], [[_MSCMP9]]
-; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB20:.*]], label %[[BB21:.*]], !prof [[PROF1]]
-; CHECK:       [[BB20]]:
+; CHECK-NEXT:    br i1 [[_MSCMP9]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]]
+; CHECK:       [[BB30]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB21]]:
+; CHECK:       [[BB31]]:
 ; CHECK-NEXT:    [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4)
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i16> [[_MSPROP]] to i128
-; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP22]], 0
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
-; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP23]], 0
-; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0
+; CHECK-NEXT:    [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0
 ; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]]
-; CHECK:       [[BB24]]:
+; CHECK-NEXT:    br i1 [[_MSCMP14]], label %[[BB39:.*]], label %[[BB40:.*]], !prof [[PROF1]]
+; CHECK:       [[BB39]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB25]]:
+; CHECK:       [[BB40]]:
 ; CHECK-NEXT:    [[RES3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[RES2]], <8 x half> [[VAL]], <8 x half> [[SRC]], i8 [[MASK]], i32 4)
-; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <8 x i16> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x half> [[RES3]]
 ;
   %val.half = load half,ptr %ptr
@@ -1476,8 +1475,8 @@ define <8 x half> @test_int_x86_avx512fp16_mask_sub_sh(<8 x half> %x1, <8 x half
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
@@ -1492,54 +1491,57 @@ define <8 x half> @test_int_x86_avx512fp16_mask_sub_sh(<8 x half> %x1, <8 x half
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP10]], align 2
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[_MSLD]], i32 0
 ; CHECK-NEXT:    [[VAL:%.*]] = insertelement <8 x half> poison, half [[VAL_HALF]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
-; CHECK:       [[BB13]]:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT:    unreachable
-; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = or i16 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 true, i16 [[TMP13]], i16 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP14]], i32 0
 ; CHECK-NEXT:    [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[X1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 -1, i32 4)
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP15]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP16]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i16> [[_MSPROP1]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = or i16 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i1> [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i16 [[TMP18]], i16 [[TMP19]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP21]], i32 0
 ; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i8 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
-; CHECK:       [[BB17]]:
+; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB22:.*]], label %[[BB23:.*]], !prof [[PROF1]]
+; CHECK:       [[BB22]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB18]]:
+; CHECK:       [[BB23]]:
 ; CHECK-NEXT:    [[RES1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[RES0]], <8 x half> [[X2]], <8 x half> [[SRC]], i8 [[MASK]], i32 4)
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP19]], 0
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <8 x i16> [[_MSPROP2]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP27:%.*]] = or i16 [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i1> [[TMP24]], i32 0
+; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i16 [[TMP27]], i16 0
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP29]], i32 0
 ; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSCMP8]], [[_MSCMP9]]
-; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB20:.*]], label %[[BB21:.*]], !prof [[PROF1]]
-; CHECK:       [[BB20]]:
+; CHECK-NEXT:    br i1 [[_MSCMP9]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]]
+; CHECK:       [[BB30]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB21]]:
+; CHECK:       [[BB31]]:
 ; CHECK-NEXT:    [[RES2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[RES1]], <8 x half> [[X2]], <8 x half> zeroinitializer, i8 [[MASK]], i32 4)
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i16> [[_MSPROP]] to i128
-; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP22]], 0
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
-; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP23]], 0
-; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <8 x i16> [[_MSPROP3]], i32 0
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <8 x i16> [[_MSPROP]], i32 0
+; CHECK-NEXT:    [[TMP35:%.*]] = or i16 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i16 [[TMP35]], i16 [[TMP36]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP38]], i32 0
 ; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i8 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]]
-; CHECK:       [[BB24]]:
+; CHECK-NEXT:    br i1 [[_MSCMP14]], label %[[BB39:.*]], label %[[BB40:.*]], !prof [[PROF1]]
+; CHECK:       [[BB39]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB25]]:
+; CHECK:       [[BB40]]:
 ; CHECK-NEXT:    [[RES3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[RES2]], <8 x half> [[VAL]], <8 x half> [[SRC]], i8 [[MASK]], i32 4)
-; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <8 x i16> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x half> [[RES3]]
 ;
   %val.half = load half,ptr %ptr
@@ -1559,8 +1561,8 @@ define <8 x half> @test_int_x86_avx512fp16_mask_mul_sh(<8 x half> %x1, <8 x half
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
@@ -1575,54 +1577,57 @@ define <8 x half> @test_int_x86_avx512fp16_mask_mul_sh(<8 x half> %x1, <8 x half
 ; CHECK-NEXT:    [[_MSLD:%.*]] =...
[truncated]

@fmayer
Copy link
Contributor

fmayer commented Apr 28, 2025

Relevant test is failing on windows bot

@thurstond
Copy link
Contributor Author

Relevant test is failing on windows bot

Argh, evaluation order got me again. Fixed in 33ee854

@thurstond thurstond requested a review from fmayer April 28, 2025 22:34
@github-actions
Copy link

github-actions bot commented Apr 28, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@fmayer
Copy link
Contributor

fmayer commented Apr 28, 2025

Please press F to format code

@thurstond
Copy link
Contributor Author

Please press F to format code

Done

@thurstond thurstond merged commit d913ea3 into llvm:main Apr 29, 2025
11 checks passed
IanWood1 pushed a commit to IanWood1/llvm-project that referenced this pull request May 6, 2025
…}.sh.round (llvm#137441)

This adds a handler, visitGenericScalarHalfwordInst, which works for
mask.{add/sub/mul/div/max/min}.sh.round.

Updates the tests in llvm#136260
GeorgeARM pushed a commit to GeorgeARM/llvm-project that referenced this pull request May 7, 2025
…}.sh.round (llvm#137441)

This adds a handler, visitGenericScalarHalfwordInst, which works for
mask.{add/sub/mul/div/max/min}.sh.round.

Updates the tests in llvm#136260
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants