diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index df31f07097f82..54d9a832d0f66 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4769,6 +4769,79 @@ struct MemorySanitizerVisitor : public InstVisitor { setOriginForNaryOp(I); } + // Approximately handle AVX Galois Field Affine Transformation + // + // e.g., + // <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8) + // <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8) + // <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8) + // Out A x b + // where A and x are packed matrices, b is a vector, + // Out = A * x + b in GF(2) + // + // Multiplication in GF(2) is equivalent to bitwise AND. However, the matrix + // computation also includes a parity calculation. + // + // For the bitwise AND of bits V1 and V2, the exact shadow is: + // Out_Shadow = (V1_Shadow & V2_Shadow) + // | (V1 & V2_Shadow) + // | (V1_Shadow & V2 ) + // + // We approximate the shadow of gf2p8affineqb using: + // Out_Shadow = gf2p8affineqb(x_Shadow, A_shadow, 0) + // | gf2p8affineqb(x, A_shadow, 0) + // | gf2p8affineqb(x_Shadow, A, 0) + // | set1_epi8(b_Shadow) + // + // This approximation has false negatives: if an intermediate dot-product + // contains an even number of 1's, the parity is 0. + // It has no false positives. + void handleAVXGF2P8Affine(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + + assert(I.arg_size() == 3); + Value *A = I.getOperand(0); + Value *X = I.getOperand(1); + Value *B = I.getOperand(2); + + assert(isFixedIntVector(A)); + assert(cast(A->getType()) + ->getElementType() + ->getScalarSizeInBits() == 8); + + assert(A->getType() == X->getType()); + + assert(B->getType()->isIntegerTy()); + assert(B->getType()->getScalarSizeInBits() == 8); + + assert(I.getType() == A->getType()); + + Value *AShadow = getShadow(A); + Value *XShadow = getShadow(X); + Value *BZeroShadow = getCleanShadow(B); + + CallInst *AShadowXShadow = IRB.CreateIntrinsic( + I.getType(), I.getIntrinsicID(), {XShadow, AShadow, BZeroShadow}); + CallInst *AShadowX = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(), + {X, AShadow, BZeroShadow}); + CallInst *XShadowA = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(), + {XShadow, A, BZeroShadow}); + + unsigned NumElements = cast(I.getType())->getNumElements(); + Value *BShadow = getShadow(B); + Value *BBroadcastShadow = getCleanShadow(AShadow); + // There is no LLVM IR intrinsic for _mm512_set1_epi8. + // This loop generates a lot of LLVM IR, which we expect that CodeGen will + // lower appropriately (e.g., VPBROADCASTB). + // Besides, b is often a constant, in which case it is fully initialized. + for (unsigned i = 0; i < NumElements; i++) + BBroadcastShadow = IRB.CreateInsertElement(BBroadcastShadow, BShadow, i); + + setShadow(&I, IRB.CreateOr( + {AShadowXShadow, AShadowX, XShadowA, BBroadcastShadow})); + setOriginForNaryOp(I); + } + // Handle Arm NEON vector load intrinsics (vld*). // // The WithLane instructions (ld[234]lane) are similar to: @@ -5604,6 +5677,13 @@ struct MemorySanitizerVisitor : public InstVisitor { break; } + // AVX Galois Field New Instructions + case Intrinsic::x86_vgf2p8affineqb_128: + case Intrinsic::x86_vgf2p8affineqb_256: + case Intrinsic::x86_vgf2p8affineqb_512: + handleAVXGF2P8Affine(I); + break; + case Intrinsic::fshl: case Intrinsic::fshr: handleFunnelShift(I); diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll index e5e4371c525b2..43da02d19693c 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll @@ -7,9 +7,6 @@ ; - llvm.x86.vgf2p8affineinvqb.128 ; - llvm.x86.vgf2p8affineinvqb.256 ; - llvm.x86.vgf2p8affineinvqb.512 -; - llvm.x86.vgf2p8affineqb.128 -; - llvm.x86.vgf2p8affineqb.256 -; - llvm.x86.vgf2p8affineqb.512 ; ; Heuristically handled: ; - llvm.x86.vgf2p8mulb.128 @@ -254,53 +251,42 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %s ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1:%.*]], <16 x i8> [[SRC2:%.*]], i8 3) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC2:%.*]], <16 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[SRC1:%.*]], i8 0) +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i8> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i8> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i8> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 3) +; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC2]], <16 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP20:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[SRC1]], i8 0) +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i8> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i8> [[TMP17]], [[TMP20]] +; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i8> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 4) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP17]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP18]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] -; CHECK: 19: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 20: +; CHECK-NEXT: [[TMP37:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP39:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC2]], <16 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP40:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[SRC1]], i8 0) +; CHECK-NEXT: [[TMP41:%.*]] = or <16 x i8> [[TMP37]], [[TMP39]] +; CHECK-NEXT: [[TMP42:%.*]] = or <16 x i8> [[TMP41]], [[TMP40]] +; CHECK-NEXT: [[TMP43:%.*]] = or <16 x i8> [[TMP42]], zeroinitializer ; CHECK-NEXT: [[TMP21:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 5) -; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP19]], <16 x i8> zeroinitializer ; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i8> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i8> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i8> [[TMP23]], [[TMP19]] ; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i8> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP25]], <16 x i8> [[TMP22]] ; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP16]], <16 x i8> zeroinitializer -; CHECK-NEXT: [[TMP27:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP27:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP43]], <16 x i8> [[TMP4]] ; CHECK-NEXT: [[TMP28:%.*]] = xor <16 x i8> [[TMP21]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i8> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i8> [[TMP28]], [[TMP43]] ; CHECK-NEXT: [[TMP30:%.*]] = or <16 x i8> [[TMP29]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP30]], <16 x i8> [[TMP27]] ; CHECK-NEXT: [[TMP31:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP21]], <16 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP38:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } { <16 x i8> splat (i8 -1), <16 x i8> splat (i8 -1), <16 x i8> splat (i8 -1) }, <16 x i8> [[TMP12]], 0 ; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } poison, <16 x i8> [[TMP11]], 0 -; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } { <16 x i8> zeroinitializer, <16 x i8> splat (i8 -1), <16 x i8> splat (i8 -1) }, <16 x i8> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP38]], <16 x i8> [[_MSPROP_SELECT]], 1 ; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP32]], <16 x i8> [[TMP26]], 1 ; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP33]], <16 x i8> [[_MSPROP_SELECT1]], 2 ; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP34]], <16 x i8> [[TMP31]], 2 @@ -329,53 +315,42 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %s ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1:%.*]], <32 x i8> [[SRC2:%.*]], i8 3) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP12]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP13]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP8:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC2:%.*]], <32 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[SRC1:%.*]], i8 0) +; CHECK-NEXT: [[TMP10:%.*]] = or <32 x i8> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i8> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i8> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], i8 3) +; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP15:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC2]], <32 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP20:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[SRC1]], i8 0) +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i8> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i8> [[TMP17]], [[TMP20]] +; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i8> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], i8 4) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i256 [[TMP17]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP18]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] -; CHECK: 19: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 20: +; CHECK-NEXT: [[TMP37:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP39:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC2]], <32 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP40:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[SRC1]], i8 0) +; CHECK-NEXT: [[TMP41:%.*]] = or <32 x i8> [[TMP37]], [[TMP39]] +; CHECK-NEXT: [[TMP42:%.*]] = or <32 x i8> [[TMP41]], [[TMP40]] +; CHECK-NEXT: [[TMP43:%.*]] = or <32 x i8> [[TMP42]], zeroinitializer ; CHECK-NEXT: [[TMP21:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], i8 5) -; CHECK-NEXT: [[TMP22:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> zeroinitializer, <32 x i8> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP19]], <32 x i8> zeroinitializer ; CHECK-NEXT: [[TMP23:%.*]] = xor <32 x i8> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = or <32 x i8> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <32 x i8> [[TMP23]], [[TMP19]] ; CHECK-NEXT: [[TMP25:%.*]] = or <32 x i8> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP25]], <32 x i8> [[TMP22]] ; CHECK-NEXT: [[TMP26:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP16]], <32 x i8> zeroinitializer -; CHECK-NEXT: [[TMP27:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> zeroinitializer, <32 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP27:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP43]], <32 x i8> [[TMP4]] ; CHECK-NEXT: [[TMP28:%.*]] = xor <32 x i8> [[TMP21]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP29:%.*]] = or <32 x i8> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = or <32 x i8> [[TMP28]], [[TMP43]] ; CHECK-NEXT: [[TMP30:%.*]] = or <32 x i8> [[TMP29]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP30]], <32 x i8> [[TMP27]] ; CHECK-NEXT: [[TMP31:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP21]], <32 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP38:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } { <32 x i8> splat (i8 -1), <32 x i8> splat (i8 -1), <32 x i8> splat (i8 -1) }, <32 x i8> [[TMP12]], 0 ; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } poison, <32 x i8> [[TMP11]], 0 -; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } { <32 x i8> zeroinitializer, <32 x i8> splat (i8 -1), <32 x i8> splat (i8 -1) }, <32 x i8> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP38]], <32 x i8> [[_MSPROP_SELECT]], 1 ; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP32]], <32 x i8> [[TMP26]], 1 ; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP33]], <32 x i8> [[_MSPROP_SELECT1]], 2 ; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP34]], <32 x i8> [[TMP31]], 2 @@ -404,53 +379,42 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %s ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP1]] to <64 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC1:%.*]], <64 x i8> [[SRC2:%.*]], i8 3) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP13]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP7:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[TMP3]], <64 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC2:%.*]], <64 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[TMP3]], <64 x i8> [[SRC1:%.*]], i8 0) +; CHECK-NEXT: [[TMP10:%.*]] = or <64 x i8> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i8> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <64 x i8> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC1]], <64 x i8> [[SRC2]], i8 3) +; CHECK-NEXT: [[TMP14:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[TMP3]], <64 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP15:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC2]], <64 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP20:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[TMP3]], <64 x i8> [[SRC1]], i8 0) +; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i8> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <64 x i8> [[TMP17]], [[TMP20]] +; CHECK-NEXT: [[TMP19:%.*]] = or <64 x i8> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC1]], <64 x i8> [[SRC2]], i8 4) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP17]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP18]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] -; CHECK: 19: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 20: +; CHECK-NEXT: [[TMP37:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[TMP3]], <64 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP39:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC2]], <64 x i8> [[TMP2]], i8 0) +; CHECK-NEXT: [[TMP40:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[TMP3]], <64 x i8> [[SRC1]], i8 0) +; CHECK-NEXT: [[TMP41:%.*]] = or <64 x i8> [[TMP37]], [[TMP39]] +; CHECK-NEXT: [[TMP42:%.*]] = or <64 x i8> [[TMP41]], [[TMP40]] +; CHECK-NEXT: [[TMP43:%.*]] = or <64 x i8> [[TMP42]], zeroinitializer ; CHECK-NEXT: [[TMP21:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC1]], <64 x i8> [[SRC2]], i8 5) -; CHECK-NEXT: [[TMP22:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> zeroinitializer, <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP19]], <64 x i8> zeroinitializer ; CHECK-NEXT: [[TMP23:%.*]] = xor <64 x i8> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = or <64 x i8> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <64 x i8> [[TMP23]], [[TMP19]] ; CHECK-NEXT: [[TMP25:%.*]] = or <64 x i8> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[TMP25]], <64 x i8> [[TMP22]] ; CHECK-NEXT: [[TMP26:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP16]], <64 x i8> zeroinitializer -; CHECK-NEXT: [[TMP27:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> zeroinitializer, <64 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP27:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP43]], <64 x i8> [[TMP4]] ; CHECK-NEXT: [[TMP28:%.*]] = xor <64 x i8> [[TMP21]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP29:%.*]] = or <64 x i8> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = or <64 x i8> [[TMP28]], [[TMP43]] ; CHECK-NEXT: [[TMP30:%.*]] = or <64 x i8> [[TMP29]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[TMP30]], <64 x i8> [[TMP27]] ; CHECK-NEXT: [[TMP31:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP21]], <64 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP38:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } { <64 x i8> splat (i8 -1), <64 x i8> splat (i8 -1), <64 x i8> splat (i8 -1) }, <64 x i8> [[TMP12]], 0 ; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } poison, <64 x i8> [[TMP11]], 0 -; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } { <64 x i8> zeroinitializer, <64 x i8> splat (i8 -1), <64 x i8> splat (i8 -1) }, <64 x i8> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP38]], <64 x i8> [[_MSPROP_SELECT]], 1 ; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP32]], <64 x i8> [[TMP26]], 1 ; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP33]], <64 x i8> [[_MSPROP_SELECT1]], 2 ; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP34]], <64 x i8> [[TMP31]], 2