diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll new file mode 100644 index 0000000000000..7af8f34d403a0 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll @@ -0,0 +1,773 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mattr=+avx10.2-512 -passes=msan -S | FileCheck %s + +; Forked from llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll +; +; Handled strictly: +; - llvm.x86.avx10.vdpphps.512 +; - llvm.x86.avx10.vmpsadbw.512 +; +; Handled heuristically: +; - llvm.x86.avx10.vpdpbssd.512 +; - llvm.x86.avx10.vpdpbssds.512 +; - llvm.x86.avx10.vpdpbsud.512 +; - llvm.x86.avx10.vpdpbsuds.512 +; - llvm.x86.avx10.vpdpbuud.512 +; - llvm.x86.avx10.vpdpbuuds.512 +; - llvm.x86.avx10.vpdpwsud.512 +; - llvm.x86.avx10.vpdpwsuds.512 +; - llvm.x86.avx10.vpdpwusd.512 +; - llvm.x86.avx10.vpdpwusds.512 +; - llvm.x86.avx10.vpdpwuud.512 +; - llvm.x86.avx10.vpdpwuuds.512 + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define <16 x float> @test_mm512_dpph_ps(<16 x float> %__W, <32 x half> %__A, <32 x half> %__B) sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_dpph_ps( +; CHECK-SAME: <16 x float> [[__W:%.*]], <32 x half> [[__A:%.*]], <32 x half> [[__B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: 
[[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float> [[__W]], <32 x half> [[__A]], <32 x half> [[__B]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = tail call <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float> %__W, <32 x half> %__A, <32 x half> %__B) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_dpph_ps(<16 x float> %__W, i16 zeroext %__U, <32 x half> %__A, <32 x half> %__B) sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_mask_dpph_ps( +; CHECK-SAME: <16 x float> [[__W:%.*]], i16 zeroext [[__U:%.*]], <32 x half> [[__A:%.*]], <32 x half> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: 
[[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <32 x i16> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[DPH:%.*]] = tail call <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float> [[__W]], <32 x half> [[__A]], <32 x half> [[__B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[BST]], <16 x i32> zeroinitializer, <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[DPH]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[__W]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP16]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x float> [[DPH]], <16 x float> [[__W]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %dph = tail call <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float> %__W, <32 x half> %__A, <32 x half> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x float> %dph, <16 x float> %__W + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_dpph_ps(i16 
zeroext %__U, <16 x float> %__W, <32 x half> %__A, <32 x half> %__B) sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_maskz_dpph_ps( +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x float> [[__W:%.*]], <32 x half> [[__A:%.*]], <32 x half> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <32 x i16> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[DPH:%.*]] = tail call <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float> [[__W]], <32 x half> [[__A]], <32 x half> [[__B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[BST]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[DPH]] to <16 x i32> +; CHECK-NEXT: 
[[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x float> [[DPH]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %dph = tail call <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float> %__W, <32 x half> %__A, <32 x half> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x float> %dph, <16 x float> zeroinitializer + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float>, <32 x half>, <32 x half>) + + +define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_dpbssd_epi32( +; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 
87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %__B = load <16 x i32>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpbssds_epi32( +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> +; CHECK-NEXT: 
[[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> [[__W]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpbssd_epi32( +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: 
[[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_dpbsud_epi32( +; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: 
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %__B = load <16 x i32>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_mask_dpbsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpbsuds_epi32( +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; 
CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> [[__W]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_maskz_dpbsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpbsud_epi32( +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; 
CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_dpbuud_epi32( +; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %__B = load <16 x i32>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_mask_dpbuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpbuuds_epi32( +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: 
[[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> [[__W]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_maskz_dpbuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpbuud_epi32( +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls 
to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <16 x i32>, <16 x i32>) + + +define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_dpwsud_epi32( +; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr 
[[PB:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %__B = load <16 x i32>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwsuds_epi32( +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: 
[[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> [[__W]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwsud_epi32( +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], 
<16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> 
@llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_dpwusd_epi32( +; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %__B = load <16 x i32>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) 
sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwusds_epi32( +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> [[__W]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W + ret <16 x i32> 
%res +} + +define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwusd_epi32( +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, 
<16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32> @test_mm512_dpwuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_dpwuud_epi32( +; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %__B = load 
<16 x i32>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwuuds_epi32( +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> [[__W]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 
8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwuud_epi32( +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = 
select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <16 x i32>, <16 x i32>) + + +define { <32 x i16>, <32 x i16>, <32 x i16> } @test_mm512_mask_mpsadbw(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) sanitize_memory { +; CHECK-LABEL: define { <32 x i16>, <32 x i16>, <32 x i16> } @test_mm512_mask_mpsadbw( +; CHECK-SAME: <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]], <32 x i16> [[X3:%.*]], i32 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> +; CHECK-NEXT: [[MSK:%.*]] = bitcast i32 [[X4]] to <32 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB8:.*]], label 
%[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[RS1:%.*]] = call <32 x i16> @llvm.x86.avx10.vmpsadbw.512(<64 x i8> [[X0]], <64 x i8> [[X1]], i8 2) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]] +; CHECK: [[BB12]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB13]]: +; CHECK-NEXT: [[AD2:%.*]] = call <32 x i16> @llvm.x86.avx10.vmpsadbw.512(<64 x i8> [[X0]], <64 x i8> [[X1]], i8 3) +; CHECK-NEXT: [[TMP14:%.*]] = select <32 x i1> [[MSK]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <32 x i16> [[AD2]], [[X3]] +; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i16> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i16> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP5]], <32 x i16> [[TMP17]], <32 x i16> [[TMP14]] +; CHECK-NEXT: [[RS2:%.*]] = select <32 x i1> [[MSK]], <32 x i16> [[AD2]], <32 x i16> [[X3]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP18]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP19]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: br i1 [[_MSOR8]], label %[[BB20:.*]], label %[[BB21:.*]], !prof [[PROF1]] +; CHECK: [[BB20]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB21]]: +; CHECK-NEXT: [[AD3:%.*]] = call <32 x 
i16> @llvm.x86.avx10.vmpsadbw.512(<64 x i8> [[X0]], <64 x i8> [[X1]], i8 4) +; CHECK-NEXT: [[TMP22:%.*]] = select <32 x i1> [[MSK]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <32 x i16> [[AD3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <32 x i16> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <32 x i16> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <32 x i1> [[TMP5]], <32 x i16> [[TMP25]], <32 x i16> [[TMP22]] +; CHECK-NEXT: [[RS3:%.*]] = select <32 x i1> [[MSK]], <32 x i16> [[AD3]], <32 x i16> zeroinitializer +; CHECK-NEXT: [[RS4:%.*]] = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } poison, <32 x i16> [[RS1]], 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } { <32 x i16> zeroinitializer, <32 x i16> splat (i16 -1), <32 x i16> splat (i16 -1) }, <32 x i16> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[RS5:%.*]] = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } [[RS4]], <32 x i16> [[RS2]], 1 +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } [[TMP26]], <32 x i16> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[RS6:%.*]] = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } [[RS5]], <32 x i16> [[RS3]], 2 +; CHECK-NEXT: store { <32 x i16>, <32 x i16>, <32 x i16> } [[TMP27]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <32 x i16>, <32 x i16>, <32 x i16> } [[RS6]] +; + %msk = bitcast i32 %x4 to <32 x i1> + %rs1 = call <32 x i16> @llvm.x86.avx10.vmpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i8 2) + %ad2 = call <32 x i16> @llvm.x86.avx10.vmpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i8 3) + %rs2 = select <32 x i1> %msk, <32 x i16> %ad2, <32 x i16> %x3 + %ad3 = call <32 x i16> @llvm.x86.avx10.vmpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i8 4) + %rs3 = select <32 x i1> %msk, <32 x i16> %ad3, <32 x i16> zeroinitializer + %rs4 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } poison, <32 x i16> %rs1, 0 + %rs5 = 
insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } %rs4, <32 x i16> %rs2, 1 + %rs6 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } %rs5, <32 x i16> %rs3, 2 + ret { <32 x i16>, <32 x i16>, <32 x i16> } %rs6 +} + +declare <32 x i16> @llvm.x86.avx10.vmpsadbw.512(<64 x i8>, <64 x i8>, i8) + + +define <8 x float> @avx_dp_ps(<8 x float> %a, <8 x float> %b) sanitize_memory { +; CHECK-LABEL: define <8 x float> @avx_dp_ps( +; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> , <8 x i32> [[TMP3]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP4]]) +; CHECK-NEXT: [[_MSDPP:%.*]] = icmp eq i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[_MSDPP]], <8 x i1> zeroinitializer, <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> , <8 x i32> [[TMP3]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP7]]) +; CHECK-NEXT: [[_MSDPP1:%.*]] = icmp eq i32 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[_MSDPP1]], <8 x i1> zeroinitializer, <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i1> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[_MSDPP2:%.*]] = sext <8 x i1> [[TMP10]] to <8 x i32> +; CHECK-NEXT: [[R:%.*]] = tail call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> [[A]], <8 x float> [[B]], i8 -1) +; CHECK-NEXT: store <8 x i32> [[_MSDPP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[R]] +; + %r = tail call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a, <8 x float> %b, i8 -1) + ret <8 x float> %r +} +;. 
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll new file mode 100644 index 0000000000000..5f0b0b39da4d9 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll @@ -0,0 +1,1127 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mattr=+avx10.2-256 -passes=msan -S | FileCheck %s + +; Forked from llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll +; +; Handled strictly: +; - llvm.x86.avx10.vdpphps.128 +; - llvm.x86.avx10.vdpphps.256 +; - llvm.x86.avx2.mpsadbw +; - llvm.x86.sse41.mpsadbw +; +; Handled heuristically: +; - llvm.x86.avx2.vpdpbssd.128 +; - llvm.x86.avx2.vpdpbssd.256 +; - llvm.x86.avx2.vpdpbssds.128 +; - llvm.x86.avx2.vpdpbssds.256 +; - llvm.x86.avx2.vpdpbsud.128 +; - llvm.x86.avx2.vpdpbsud.256 +; - llvm.x86.avx2.vpdpbsuds.128 +; - llvm.x86.avx2.vpdpbsuds.256 +; - llvm.x86.avx2.vpdpbuud.128 +; - llvm.x86.avx2.vpdpbuud.256 +; - llvm.x86.avx2.vpdpbuuds.128 +; - llvm.x86.avx2.vpdpbuuds.256 +; - llvm.x86.avx2.vpdpwsud.128 +; - llvm.x86.avx2.vpdpwsud.256 +; - llvm.x86.avx2.vpdpwsuds.128 +; - llvm.x86.avx2.vpdpwsuds.256 +; - llvm.x86.avx2.vpdpwusd.128 +; - llvm.x86.avx2.vpdpwusd.256 +; - llvm.x86.avx2.vpdpwusds.128 +; - llvm.x86.avx2.vpdpwusds.256 +; - llvm.x86.avx2.vpdpwuud.128 +; - llvm.x86.avx2.vpdpwuud.256 +; - llvm.x86.avx2.vpdpwuuds.128 +; - llvm.x86.avx2.vpdpwuuds.256 + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define <4 x float> @test_mm_dpph_ps(<4 x float> %__W, <8 x half> %__A, <8 x half> %__B) sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm_dpph_ps( +; CHECK-SAME: <4 x float> [[__W:%.*]], <8 x half> [[__A:%.*]], <8 x half> [[__B:%.*]]) 
#[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float> [[__W]], <8 x half> [[__A]], <8 x half> [[__B]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = tail call <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float> %__W, <8 x half> %__A, <8 x half> %__B) + ret <4 x float> %res +} + +define <4 x float> @test_mm_mask_dpph_ps(<4 x float> %__W, i8 zeroext %__U, <8 x half> %__A, <8 x half> %__B) sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm_mask_dpph_ps( +; CHECK-SAME: <4 x float> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x half> [[__A:%.*]], <8 x half> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add 
(i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[DPH:%.*]] = tail call <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float> [[__W]], <8 x half> [[__A]], <8 x half> [[__B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> splat (i1 true), <4 x i32> +; CHECK-NEXT: [[EXT:%.*]] = shufflevector <8 x i1> [[BST]], <8 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[EXT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x float> [[DPH]] to <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x float> [[__W]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[TMP1]] +; 
CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP16]], <4 x i32> [[TMP11]] +; CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[EXT]], <4 x float> [[DPH]], <4 x float> [[__W]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %dph = tail call <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float> %__W, <8 x half> %__A, <8 x half> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %ext = shufflevector <8 x i1> %bst, <8 x i1> poison, <4 x i32> + %res = select <4 x i1> %ext, <4 x float> %dph, <4 x float> %__W + ret <4 x float> %res +} + +define <4 x float> @test_mm_maskz_dpph_ps(i8 zeroext %__U, <4 x float> %__W, <8 x half> %__A, <8 x half> %__B) sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm_maskz_dpph_ps( +; CHECK-SAME: i8 zeroext [[__U:%.*]], <4 x float> [[__W:%.*]], <8 x half> [[__A:%.*]], <8 x half> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], 
label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[DPH:%.*]] = tail call <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float> [[__W]], <8 x half> [[__A]], <8 x half> [[__B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> splat (i1 true), <4 x i32> +; CHECK-NEXT: [[EXT:%.*]] = shufflevector <8 x i1> [[BST]], <8 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[EXT]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x float> [[DPH]] to <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP15]], <4 x i32> [[TMP11]] +; CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[EXT]], <4 x float> [[DPH]], <4 x float> zeroinitializer +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %dph = tail call <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float> %__W, <8 x half> %__A, <8 x half> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %ext = shufflevector <8 x i1> %bst, <8 x i1> poison, <4 x i32> + %res = select <4 x i1> %ext, <4 x float> %dph, <4 x float> zeroinitializer + ret <4 x float> %res +} + +define <8 x float> @test_mm256_dpph_ps(<8 x float> %__W, <16 x half> %__A, <16 x half> %__B) sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_dpph_ps( +; CHECK-SAME: <8 x float> [[__W:%.*]], <16 x half> [[__A:%.*]], <16 x half> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x 
i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[RES:%.*]] = tail call <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float> [[__W]], <16 x half> [[__A]], <16 x half> [[__B]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[RES]] +; + %res = tail call <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float> %__W, <16 x half> %__A, <16 x half> %__B) + ret <8 x float> %res +} + +define <8 x float> @test_mm256_mask_dpph_ps(<8 x float> %__W, i8 zeroext %__U, <16 x half> %__A, <16 x half> %__B) sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_mask_dpph_ps( +; CHECK-SAME: <8 x float> [[__W:%.*]], i8 zeroext [[__U:%.*]], <16 x half> [[__A:%.*]], <16 x half> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to 
ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i16> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[DPH:%.*]] = tail call <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float> [[__W]], <16 x half> [[__A]], <16 x half> [[__B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[BST]], <8 x i32> zeroinitializer, <8 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x float> [[DPH]] to <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x float> [[__W]] to <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i32> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP16]], <8 x i32> [[TMP11]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[BST]], <8 x float> [[DPH]], <8 x float> [[__W]] +; CHECK-NEXT: store <8 x i32> 
[[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[RES]] +; + %dph = tail call <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float> %__W, <16 x half> %__A, <16 x half> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x float> %dph, <8 x float> %__W + ret <8 x float> %res +} + +define <8 x float> @test_mm256_maskz_dpph_ps(i8 zeroext %__U, <8 x float> %__W, <16 x half> %__A, <16 x half> %__B) sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_maskz_dpph_ps( +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x float> [[__W:%.*]], <16 x half> [[__A:%.*]], <16 x half> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i16> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[DPH:%.*]] = tail call <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float> [[__W]], 
<16 x half> [[__A]], <16 x half> [[__B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[BST]], <8 x i32> zeroinitializer, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x float> [[DPH]] to <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP15]], <8 x i32> [[TMP11]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[BST]], <8 x float> [[DPH]], <8 x float> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[RES]] +; + %dph = tail call <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float> %__W, <16 x half> %__A, <16 x half> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x float> %dph, <8 x float> zeroinitializer + ret <8 x float> %res +} + +declare <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float>, <8 x half>, <8 x half>) +declare <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float>, <16 x half>, <16 x half>) + + +define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpbssd_epi32( +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; 
CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP9]], <4 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[DPI]], <4 x i32> [[__W]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W + ret <4 x i32> %res +} + +define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpbssds_epi32( +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; 
CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP9]], <4 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[DPI]], <4 x i32> zeroinitializer +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer + ret <4 x i32> %res +} + +define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpbssds_epi32( +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint 
(ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[DPI]], <8 x i32> [[__W]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W + ret <8 x i32> %res +} + +define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpbssd_epi32( +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint 
(ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[DPI]], <8 x i32> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> 
@llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpbsud_epi32( +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP9]], <4 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[DPI]], <4 x i32> [[__W]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> 
%__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W + ret <4 x i32> %res +} + +define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpbsuds_epi32( +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP9]], <4 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[DPI]], <4 x i32> zeroinitializer +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] 
+; + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer + ret <4 x i32> %res +} + +define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpbsuds_epi32( +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[DPI]], <8 x i32> [[__W]] +; CHECK-NEXT: store <8 x i32> 
[[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W + ret <8 x i32> %res +} + +define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpbsud_epi32( +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] 
= select <8 x i1> [[BST]], <8 x i32> [[DPI]], <8 x i32> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpbuud_epi32( +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> +; 
CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP9]], <4 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[DPI]], <4 x i32> [[__W]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W + ret <4 x i32> %res +} + +define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpbuuds_epi32( +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to 
<4 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP9]], <4 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[DPI]], <4 x i32> zeroinitializer +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer + ret <4 x i32> %res +} + +define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpbuuds_epi32( +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> 
@llvm.x86.avx2.vpdpbuuds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[DPI]], <8 x i32> [[__W]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W + ret <8 x i32> %res +} + +define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpbuud_epi32( +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x 
i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[DPI]], <8 x i32> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + + +define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpwsud_epi32( +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to 
ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP9]], <4 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[DPI]], <4 x i32> [[__W]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W + ret <4 x i32> %res +} + +define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwsuds_epi32( +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), 
align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP9]], <4 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[DPI]], <4 x i32> zeroinitializer +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer + ret <4 x i32> %res +} + +define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwsuds_epi32( +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; 
CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[DPI]], <8 x i32> [[__W]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W + ret <8 x i32> %res +} + +define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwsud_epi32( +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> 
[[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[DPI]], <8 x i32> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 
x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpwusd_epi32( +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP9]], <4 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[DPI]], <4 x i32> [[__W]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; 
CHECK-NEXT: ret <4 x i32> [[RES]] +; + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W + ret <4 x i32> %res +} + +define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwusds_epi32( +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP9]], <4 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[DPI]], <4 x i32> 
zeroinitializer +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer + ret <4 x i32> %res +} + +define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwusds_epi32( +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP9]], <8 x 
i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[DPI]], <8 x i32> [[__W]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W + ret <8 x i32> %res +} + +define <8 x i32> @test_mm256_mask_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwusd_epi32( +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], 
zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[DPI]], <8 x i32> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpwuud_epi32( +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 
x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP9]], <4 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[DPI]], <4 x i32> [[__W]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W + ret <4 x i32> %res +} + +define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwuuds_epi32( +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> 
@llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP9]], <4 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[DPI]], <4 x i32> zeroinitializer +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer + ret <4 x i32> %res +} + +define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwuuds_epi32( +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], 
[[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[DPI]], [[__W]] +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[DPI]], <8 x i32> [[__W]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W + ret <8 x i32> %res +} + +define <8 x i32> @test_mm256_mask_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwuud_epi32( +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[DPI]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[DPI]], <8 x i32> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) sanitize_memory { +; CHECK-LABEL: define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128( +; CHECK-SAME: <16 x i8> [[X0:%.*]], <16 x i8> [[X1:%.*]], <8 x i16> [[X3:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 
add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[MSK:%.*]] = bitcast i8 [[X4]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[RS1:%.*]] = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> [[X0]], <16 x i8> [[X1]], i8 2) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]] +; CHECK: [[BB12]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB13]]: +; CHECK-NEXT: [[AD2:%.*]] = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> [[X0]], <16 x i8> [[X1]], i8 3) +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[MSK]], <8 x i16> zeroinitializer, <8 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = 
xor <8 x i16> [[AD2]], [[X3]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i16> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i16> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i16> [[TMP17]], <8 x i16> [[TMP14]] +; CHECK-NEXT: [[RS2:%.*]] = select <8 x i1> [[MSK]], <8 x i16> [[AD2]], <8 x i16> [[X3]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP18]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP19]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: br i1 [[_MSOR8]], label %[[BB20:.*]], label %[[BB21:.*]], !prof [[PROF1]] +; CHECK: [[BB20]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB21]]: +; CHECK-NEXT: [[AD3:%.*]] = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> [[X0]], <16 x i8> [[X1]], i8 4) +; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[MSK]], <8 x i16> zeroinitializer, <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i16> [[AD3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i16> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i16> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP5]], <8 x i16> [[TMP25]], <8 x i16> [[TMP22]] +; CHECK-NEXT: [[RS3:%.*]] = select <8 x i1> [[MSK]], <8 x i16> [[AD3]], <8 x i16> zeroinitializer +; CHECK-NEXT: [[RS4:%.*]] = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> } poison, <8 x i16> [[RS1]], 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> } { <8 x i16> zeroinitializer, <8 x i16> splat (i16 -1), <8 x i16> splat (i16 -1) }, <8 x i16> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[RS5:%.*]] = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[RS4]], <8 x i16> [[RS2]], 1 +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i16>, <8 x 
i16>, <8 x i16> } [[TMP26]], <8 x i16> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[RS6:%.*]] = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[RS5]], <8 x i16> [[RS3]], 2 +; CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP27]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i16>, <8 x i16>, <8 x i16> } [[RS6]] +; + %msk = bitcast i8 %x4 to <8 x i1> + %rs1 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %x0, <16 x i8> %x1, i8 2) + %ad2 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %x0, <16 x i8> %x1, i8 3) + %rs2 = select <8 x i1> %msk, <8 x i16> %ad2, <8 x i16> %x3 + %ad3 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %x0, <16 x i8> %x1, i8 4) + %rs3 = select <8 x i1> %msk, <8 x i16> %ad3, <8 x i16> zeroinitializer + %rs4 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> } poison, <8 x i16> %rs1, 0 + %rs5 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> } %rs4, <8 x i16> %rs2, 1 + %rs6 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> } %rs5, <8 x i16> %rs3, 2 + ret { <8 x i16>, <8 x i16>, <8 x i16> } %rs6 +} + +define { <16 x i16>, <16 x i16>, <16 x i16> } @test_mask_mpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) sanitize_memory { +; CHECK-LABEL: define { <16 x i16>, <16 x i16>, <16 x i16> } @test_mask_mpsadbw_256( +; CHECK-SAME: <32 x i8> [[X0:%.*]], <32 x i8> [[X1:%.*]], <16 x i16> [[X3:%.*]], i16 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; 
CHECK-NEXT: [[MSK:%.*]] = bitcast i16 [[X4]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[RS1:%.*]] = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> [[X0]], <32 x i8> [[X1]], i8 2) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]] +; CHECK: [[BB12]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB13]]: +; CHECK-NEXT: [[AD2:%.*]] = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> [[X0]], <32 x i8> [[X1]], i8 3) +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[MSK]], <16 x i16> zeroinitializer, <16 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i16> [[AD2]], [[X3]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i16> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i16> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i16> [[TMP17]], <16 x i16> [[TMP14]] +; CHECK-NEXT: [[RS2:%.*]] = select <16 x i1> [[MSK]], <16 x i16> [[AD2]], <16 x i16> [[X3]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i256 [[TMP18]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = 
bitcast <32 x i8> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP19]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: br i1 [[_MSOR8]], label %[[BB20:.*]], label %[[BB21:.*]], !prof [[PROF1]] +; CHECK: [[BB20]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB21]]: +; CHECK-NEXT: [[AD3:%.*]] = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> [[X0]], <32 x i8> [[X1]], i8 4) +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[MSK]], <16 x i16> zeroinitializer, <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i16> [[AD3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i16> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i16> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP5]], <16 x i16> [[TMP25]], <16 x i16> [[TMP22]] +; CHECK-NEXT: [[RS3:%.*]] = select <16 x i1> [[MSK]], <16 x i16> [[AD3]], <16 x i16> zeroinitializer +; CHECK-NEXT: [[RS4:%.*]] = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> } poison, <16 x i16> [[RS1]], 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> } { <16 x i16> zeroinitializer, <16 x i16> splat (i16 -1), <16 x i16> splat (i16 -1) }, <16 x i16> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[RS5:%.*]] = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> } [[RS4]], <16 x i16> [[RS2]], 1 +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> } [[TMP26]], <16 x i16> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[RS6:%.*]] = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> } [[RS5]], <16 x i16> [[RS3]], 2 +; CHECK-NEXT: store { <16 x i16>, <16 x i16>, <16 x i16> } [[TMP27]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i16>, <16 x i16>, <16 x i16> } [[RS6]] +; + %msk = bitcast i16 %x4 to <16 x i1> + %rs1 = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %x0, <32 x i8> %x1, i8 2) + %ad2 = call <16 x 
i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %x0, <32 x i8> %x1, i8 3) + %rs2 = select <16 x i1> %msk, <16 x i16> %ad2, <16 x i16> %x3 + %ad3 = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %x0, <32 x i8> %x1, i8 4) + %rs3 = select <16 x i1> %msk, <16 x i16> %ad3, <16 x i16> zeroinitializer + %rs4 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> } poison, <16 x i16> %rs1, 0 + %rs5 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> } %rs4, <16 x i16> %rs2, 1 + %rs6 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> } %rs5, <16 x i16> %rs3, 2 + ret { <16 x i16>, <16 x i16>, <16 x i16> } %rs6 +} + +declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) +declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll new file mode 100644 index 0000000000000..983d5aaada652 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll @@ -0,0 +1,655 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mattr=+avx512vnni,+avx512vl -passes=msan -S | FileCheck %s + +; Forked from llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics-upgrade.ll +; +; Handled strictly: (none) +; +; Handled heuristically: +; - llvm.x86.avx512.vpdpbusd.128 +; - llvm.x86.avx512.vpdpbusd.256 +; - llvm.x86.avx512.vpdpbusds.128 +; - llvm.x86.avx512.vpdpbusds.256 +; - llvm.x86.avx512.vpdpwssd.128 +; - llvm.x86.avx512.vpdpwssd.256 +; - llvm.x86.avx512.vpdpwssds.128 +; - llvm.x86.avx512.vpdpwssds.256 + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +declare <8 x 
i32> @llvm.x86.avx512.maskz.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpbusd_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP4]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory { +; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; 
CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> 
@llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i32> [[TMP23]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT4:%.*]] = select <8 x i1> [[TMP20]], <8 x i32> [[TMP25]], <8 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[TMP19]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i32>, <8 x i32> } { <8 x i32> splat (i32 -1), <8 x i32> splat (i32 -1) }, <8 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i32>, <8 x i32> } [[TMP27]], <8 x i32> [[_MSPROP_SELECT4]], 1 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i32>, <8 x i32> } [[RES2]], <8 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <8 x i32>, <8 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i32>, <8 x i32> } [[RES3]] +; + %x2 = load <8 x i32>, ptr %x2p + %res0 = call <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) + %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8 %x3) + %res2 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %res0, 0 + %res3 = insertvalue { <8 x i32>, <8 x i32> } %res2, <8 x i32> %res1, 1 + ret { <8 x i32>, <8 x i32> } %res3 +} + +declare <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) +declare <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) + +define <4 x 
i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpbusd_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + ret <4 x i32> %res +} + +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory { +; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 56) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP2]], <4 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4
x i32> [[_MSPROP4]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> [[TMP21]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP22:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[_MSPROP5]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <4 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <4 x i32> [[TMP23]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP25:%.*]] = or <4 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT7:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP25]], <4 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[TMP19]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <4 x i32>, <4 x i32> } { <4 x i32> splat (i32 -1), <4 x i32> splat (i32 -1) }, <4 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[TMP27]], <4 x i32> [[_MSPROP_SELECT7]], 1 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[RES2]], <4 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <4 x i32>, <4 x i32> } [[RES3]] +; + %x2 = load <4 x i32>, ptr %x2p + %res0 = call <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) + %res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8 %x3) + %res2 = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> %res0, 0 + %res3 = insertvalue { <4 x
i32>, <4 x i32> } %res2, <4 x i32> %res1, 1 + ret { <4 x i32>, <4 x i32> } %res3 +} + +declare <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +declare <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpbusds_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP4]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory { +; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] 
= load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] +; CHECK-NEXT: 
[[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i32> [[TMP23]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT4:%.*]] = select <8 x i1> [[TMP20]], <8 x i32> [[TMP25]], <8 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[TMP19]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i32>, <8 x i32> } { <8 x i32> splat (i32 -1), <8 x i32> splat (i32 -1) }, <8 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i32>, <8 x i32> } [[TMP27]], <8 x i32> [[_MSPROP_SELECT4]], 1 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i32>, <8 x i32> } [[RES2]], <8 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <8 x i32>, <8 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i32>, <8 x i32> } [[RES3]] +; + %x2 = load <8 x i32>, ptr %x2p + %res0 = call <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) + %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8 %x3) + %res2 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %res0, 0 + %res3 = insertvalue { <8 x i32>, <8 x i32> } %res2, <8 x i32> %res1, 1 + ret { <8 x i32>, <8 x i32> } %res3 +} + +declare <4 x i32> 
@llvm.x86.avx512.mask.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) +declare <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpbusds_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + ret <4 x i32> %res +} + +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory { +; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP2]], <4 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] +;
CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> [[TMP21]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP22:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[_MSPROP5]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <4 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <4 x i32> [[TMP23]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP25:%.*]] = or <4 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT7:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP25]], <4 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[TMP19]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <4 x i32>, <4 x i32> } { <4 x i32> splat (i32 -1), <4 x i32> splat (i32 -1) }, <4 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[TMP27]], <4 x i32> [[_MSPROP_SELECT7]], 1 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[RES2]], <4 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <4 x i32>, <4 x i32> } [[RES3]] +; + %x2 = load <4 x i32>, ptr %x2p + %res0 = call <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) + %res1 = call <4
x i32> @llvm.x86.avx512.maskz.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8 %x3) + %res2 = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> %res0, 0 + %res3 = insertvalue { <4 x i32>, <4 x i32> } %res2, <4 x i32> %res1, 1 + ret { <4 x i32>, <4 x i32> } %res3 +} + +declare <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +declare <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpwssd_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP4]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory { +; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> 
[[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: 
[[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i32> [[TMP23]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT4:%.*]] = select <8 x i1> [[TMP20]], <8 x i32> [[TMP25]], <8 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[TMP19]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i32>, <8 x i32> } { <8 x i32> splat (i32 -1), <8 x i32> splat (i32 -1) }, <8 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i32>, <8 x i32> } [[TMP27]], <8 x i32> [[_MSPROP_SELECT4]], 1 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i32>, <8 x i32> } [[RES2]], <8 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <8 x i32>, <8 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i32>, <8 x i32> } [[RES3]] +; + %x2 = load <8 x i32>, ptr %x2p + %res0 = call <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) + %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8 %x3) + %res2 = 
insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %res0, 0 + %res3 = insertvalue { <8 x i32>, <8 x i32> } %res2, <8 x i32> %res1, 1 + ret { <8 x i32>, <8 x i32> } %res3 +} + +declare <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) +declare <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpwssd_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + ret <4 x i32> %res +} + +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory { +; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint 
(ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP2]], <4 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[_MSPROP2]] +;
CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> +; CHECK-NEXT: [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> [[TMP21]], <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[_MSPROP5]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <4 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <4 x i32> [[TMP23]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP25:%.*]] = or <4 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT7:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP25]], <4 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[TMP19]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <4 x i32>, <4 x i32> } { <4 x i32> splat (i32 -1), <4 x i32> splat (i32 -1) }, <4 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[TMP27]], <4 x i32> [[_MSPROP_SELECT7]], 1 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[RES2]], <4 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <4 x i32>, <4 x i32> } 
[[RES3]] +; + %x2 = load <4 x i32>, ptr %x2p + %res0 = call <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) + %res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8 %x3) + %res2 = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> %res0, 0 + %res3 = insertvalue { <4 x i32>, <4 x i32> } %res2, <4 x i32> %res1, 1 + ret { <4 x i32>, <4 x i32> } %res3 +} + + +declare <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +declare <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpwssds_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP4]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory { 
+; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x 
i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i32> [[TMP23]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT4:%.*]] = select <8 x i1> [[TMP20]], <8 x i32> [[TMP25]], <8 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[TMP19]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i32>, <8 x i32> } { <8 x i32> splat (i32 -1), <8 x i32> splat (i32 -1) }, <8 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i32>, <8 x i32> } [[TMP27]], <8 x i32> [[_MSPROP_SELECT4]], 1 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i32>, <8 x i32> } [[RES2]], <8 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <8 x i32>, <8 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i32>, <8 x i32> } [[RES3]] +; + %x2 = load <8 x i32>, ptr %x2p + %res0 = call <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256(<8 x i32> 
%x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) + %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8 %x3) + %res2 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %res0, 0 + %res3 = insertvalue { <8 x i32>, <8 x i32> } %res2, <8 x i32> %res1, 1 + ret { <8 x i32>, <8 x i32> } %res3 +} + +declare <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) +declare <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpwssds_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + ret <4 x i32> %res +} + +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory { +; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128( +; CHECK-SAME: <4 x i32> 
[[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> [[TMP13]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[EXTRACT]], 
<4 x i32> [[_MSPROP2]], <4 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> +; CHECK-NEXT: [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> [[TMP21]], <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[_MSPROP5]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <4 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <4 x i32> [[TMP23]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP25:%.*]] = or <4 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT7:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP25]], <4 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[TMP19]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <4 x i32>, <4 x i32> } { <4 x i32> splat (i32 -1), <4 x i32> splat (i32 -1) }, <4 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[TMP27]], <4 x i32> [[_MSPROP_SELECT7]], 1 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <4 x i32>, <4 
x i32> } [[RES2]], <4 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <4 x i32>, <4 x i32> } [[RES3]] +; + %x2 = load <4 x i32>, ptr %x2p + %res0 = call <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) + %res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8 %x3) + %res2 = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> %res0, 0 + %res3 = insertvalue { <4 x i32>, <4 x i32> } %res2, <4 x i32> %res1, 1 + ret { <4 x i32>, <4 x i32> } %res3 +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll new file mode 100644 index 0000000000000..234d68f1aaf56 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll @@ -0,0 +1,698 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mattr=+avx512vnni,+avx512vl -passes=msan -S | FileCheck %s + +; Forked from llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll +; +; Handled strictly: (none) +; +; Handled heuristically: +; - llvm.x86.avx512.vpdpbusd.128 +; - llvm.x86.avx512.vpdpbusd.256 +; - llvm.x86.avx512.vpdpbusds.128 +; - llvm.x86.avx512.vpdpbusds.256 +; - llvm.x86.avx512.vpdpwssd.128 +; - llvm.x86.avx512.vpdpwssd.256 +; - llvm.x86.avx512.vpdpwssds.128 +; - llvm.x86.avx512.vpdpwssds.256 + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <8 x i32> 
@test_int_x86_avx512_vpdpbusd_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP4]] +; + %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %1 +} + +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory { +; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), 
i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> 
[[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i32> [[TMP23]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT4:%.*]] = select <8 x i1> [[TMP20]], <8 x i32> [[TMP25]], <8 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[TMP19]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i32>, <8 x i32> } { <8 x i32> splat (i32 -1), <8 x i32> splat (i32 -1) }, <8 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES1:%.*]] = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i32>, <8 x i32> } [[TMP27]], <8 x i32> [[_MSPROP_SELECT4]], 1 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <8 x i32>, <8 x i32> } [[RES1]], <8 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <8 x i32>, <8 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i32>, <8 x i32> } [[RES2]] +; + %x2 = load <8 x i32>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0 + %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer + %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0 + %res2 = insertvalue { <8 x i32>, <8 x i32> } %res1, <8 x i32> %6, 1 + ret { <8 x i32>, <8 x i32> } %res2 +} + +declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpbusd_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], 
<4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; + %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %1 +} + +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory { +; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; 
CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> [[TMP13]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 
x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> +; CHECK-NEXT: [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> [[TMP21]], <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[_MSPROP4]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <4 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <4 x i32> [[TMP23]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP25:%.*]] = or <4 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT6:%.*]] = select <4 x i1> [[_MSPROP5]], <4 x i32> [[TMP25]], <4 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[TMP19]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <4 x i32>, <4 x i32> } { <4 x i32> splat (i32 -1), <4 x i32> splat (i32 -1) }, <4 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES1:%.*]] = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[TMP27]], <4 x i32> [[_MSPROP_SELECT6]], 1 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[RES1]], <4 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <4 x i32>, <4 x i32> } [[RES2]] +; + %x2 = load <4 x i32>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0 + %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %5 = bitcast i8 %x3 to <8 x i1> + %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> + %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer + %res1 = insertvalue { <4 
x i32>, <4 x i32> } poison, <4 x i32> %3, 0 + %res2 = insertvalue { <4 x i32>, <4 x i32> } %res1, <4 x i32> %6, 1 + ret { <4 x i32>, <4 x i32> } %res2 +} + +declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpbusds_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP4]] +; + %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %1 +} + +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory { +; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] 
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i32> [[TMP23]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT4:%.*]] = select <8 x i1> [[TMP20]], <8 x i32> [[TMP25]], <8 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[TMP19]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i32>, <8 x i32> } { <8 x i32> splat (i32 -1), <8 x i32> splat (i32 -1) }, <8 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES1:%.*]] = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i32>, <8 x i32> } [[TMP27]], <8 x i32> [[_MSPROP_SELECT4]], 1 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <8 x i32>, <8 x i32> } [[RES1]], <8 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <8 x i32>, <8 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i32>, <8 x i32> } [[RES2]] +; + %x2 = load <8 x i32>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0 + %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer + %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0 + %res2 = insertvalue { <8 x i32>, <8 x 
i32> } %res1, <8 x i32> %6, 1 + ret { <8 x i32>, <8 x i32> } %res2 +} + +declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpbusds_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; + %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %1 +} + +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory { +; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint 
(ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> [[TMP13]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <4 x 
i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> +; CHECK-NEXT: [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> [[TMP21]], <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[_MSPROP4]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <4 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <4 x i32> [[TMP23]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP25:%.*]] = or <4 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT6:%.*]] = select <4 x i1> [[_MSPROP5]], <4 x i32> [[TMP25]], <4 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[TMP19]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <4 x i32>, <4 x i32> } { <4 x i32> splat (i32 -1), <4 x i32> splat (i32 -1) }, <4 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES1:%.*]] = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[TMP27]], <4 x i32> [[_MSPROP_SELECT6]], 1 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[RES1]], <4 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <4 x i32>, <4 x i32> } [[RES2]] +; + %x2 = load <4 x i32>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, 
<8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0 + %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %5 = bitcast i8 %x3 to <8 x i1> + %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> + %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer + %res1 = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> %3, 0 + %res2 = insertvalue { <4 x i32>, <4 x i32> } %res1, <4 x i32> %6, 1 + ret { <4 x i32>, <4 x i32> } %res2 +} + +declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpwssd_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP4]] +; + %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %1 +} + +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory { +; CHECK-LABEL: define { <8 x i32>, <8 x i32> } 
@test_int_x86_avx512_mask_vpdpwssd_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: 
[[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i32> [[TMP23]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT4:%.*]] = select <8 x i1> [[TMP20]], <8 x i32> [[TMP25]], <8 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[TMP19]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i32>, <8 x i32> } { <8 x i32> splat (i32 -1), <8 x i32> splat (i32 -1) }, <8 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES1:%.*]] = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i32>, <8 x i32> } [[TMP27]], <8 x i32> [[_MSPROP_SELECT4]], 1 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <8 x i32>, <8 x i32> } [[RES1]], <8 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <8 x i32>, <8 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i32>, <8 x i32> } [[RES2]] +; + %x2 = load <8 x i32>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = bitcast 
i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0 + %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer + %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0 + %res2 = insertvalue { <8 x i32>, <8 x i32> } %res1, <8 x i32> %6, 1 + ret { <8 x i32>, <8 x i32> } %res2 +} + +declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpwssd_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; + %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %1 +} + +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory { +; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x 
i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> [[TMP13]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> 
[[_MSPROP1]], <4 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> +; CHECK-NEXT: [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> [[TMP21]], <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[_MSPROP4]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <4 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <4 x i32> [[TMP23]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP25:%.*]] = or <4 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT6:%.*]] = select <4 x i1> [[_MSPROP5]], <4 x i32> [[TMP25]], <4 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[TMP19]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <4 x i32>, <4 x i32> } { <4 x i32> splat (i32 -1), <4 x i32> splat (i32 -1) }, <4 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES1:%.*]] = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[TMP27]], <4 x i32> [[_MSPROP_SELECT6]], 1 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <4 x i32>, <4 x i32> } 
[[RES1]], <4 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <4 x i32>, <4 x i32> } [[RES2]] +; + %x2 = load <4 x i32>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0 + %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %5 = bitcast i8 %x3 to <8 x i1> + %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> + %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer + %res1 = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> %3, 0 + %res2 = insertvalue { <4 x i32>, <4 x i32> } %res1, <4 x i32> %6, 1 + ret { <4 x i32>, <4 x i32> } %res2 +} + +declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpwssds_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr 
@__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP4]] +; + %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %1 +} + +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory { +; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> 
@llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i32> [[TMP23]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT4:%.*]] = select <8 x i1> [[TMP20]], <8 x i32> [[TMP25]], <8 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[TMP19]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i32>, <8 x i32> } { <8 x i32> splat (i32 -1), <8 x i32> splat (i32 -1) }, <8 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES1:%.*]] = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i32>, <8 x i32> } [[TMP27]], <8 x i32> [[_MSPROP_SELECT4]], 1 
+; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <8 x i32>, <8 x i32> } [[RES1]], <8 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <8 x i32>, <8 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i32>, <8 x i32> } [[RES2]] +; + %x2 = load <8 x i32>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0 + %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer + %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0 + %res2 = insertvalue { <8 x i32>, <8 x i32> } %res1, <8 x i32> %6, 1 + ret { <8 x i32>, <8 x i32> } %res2 +} + +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpwssds_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2P]] to i64 +; 
CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP9]] +; + %x2 = load <4 x i32>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %1 +} + +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory { +; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: 
unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> [[TMP13]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> +; CHECK-NEXT: [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP21]], 
<8 x i1> [[TMP21]], <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[_MSPROP4]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <4 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <4 x i32> [[TMP23]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP25:%.*]] = or <4 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT6:%.*]] = select <4 x i1> [[_MSPROP5]], <4 x i32> [[TMP25]], <4 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[TMP19]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <4 x i32>, <4 x i32> } { <4 x i32> splat (i32 -1), <4 x i32> splat (i32 -1) }, <4 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES1:%.*]] = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[TMP27]], <4 x i32> [[_MSPROP_SELECT6]], 1 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[RES1]], <4 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <4 x i32>, <4 x i32> } [[RES2]] +; + %x2 = load <4 x i32>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0 + %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %5 = bitcast i8 %x3 to <8 x i1> + %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> + %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer + %res1 = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> %3, 0 + %res2 = insertvalue { <4 x i32>, <4 x i32> } %res1, <4 x i32> %6, 1 + ret { <4 x i32>, <4 x i32> } %res2 +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll new file mode 100644 index 0000000000000..77306202dc4fe --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll @@ -0,0 +1,327 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mattr=+avx512vnni -passes=msan -S | FileCheck %s + +; Forked from llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll +; +; Handled strictly: (none) +; +; Handled heuristically: +; - llvm.x86.avx512.vpdpbusd.512 +; - llvm.x86.avx512.vpdpbusds.512 +; - llvm.x86.avx512.vpdpwssd.512 +; - llvm.x86.avx512.vpdpwssds.512 + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpbusd_512( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> 
@llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory { +; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], 
align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i32> [[TMP23]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT4:%.*]] = select <16 x i1> [[TMP20]], <16 x i32> [[TMP25]], <16 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP19]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x 
i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT4]], 1 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[RES2]], <16 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <16 x i32>, <16 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32> } [[RES3]] +; + %x2 = load <16 x i32>, ptr %x2p + %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 %x3) + %res2 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 + %res3 = insertvalue { <16 x i32>, <16 x i32> } %res2, <16 x i32> %res1, 1 + ret { <16 x i32>, <16 x i32> } %res3 +} + +declare <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpbusds_512( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: 
[[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory { +; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x 
i32>, ptr [[TMP10]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i32> [[TMP23]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT4:%.*]] = select <16 x i1> [[TMP20]], <16 x i32> [[TMP25]], <16 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP19]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> 
splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT4]], 1 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[RES2]], <16 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <16 x i32>, <16 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32> } [[RES3]] +; + %x2 = load <16 x i32>, ptr %x2p + %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 %x3) + %res2 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 + %res3 = insertvalue { <16 x i32>, <16 x i32> } %res2, <16 x i32> %res1, 1 + ret { <16 x i32>, <16 x i32> } %res3 +} + +declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpwssd_512( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], 
[[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory { +; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: 
[[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i32> [[TMP23]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT4:%.*]] = select <16 x i1> [[TMP20]], <16 x i32> [[TMP25]], <16 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP19]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32> } { <16 x i32> 
splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT4]], 1 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[RES2]], <16 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <16 x i32>, <16 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32> } [[RES3]] +; + %x2 = load <16 x i32>, ptr %x2p + %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 %x3) + %res2 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 + %res3 = insertvalue { <16 x i32>, <16 x i32> } %res2, <16 x i32> %res1, 1 + ret { <16 x i32>, <16 x i32> } %res3 +} + +declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpwssds_512( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = 
or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory { +; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to 
ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i32> [[TMP23]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT4:%.*]] = select <16 x i1> [[TMP20]], <16 x i32> [[TMP25]], <16 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP19]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x 
i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT4]], 1 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[RES2]], <16 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <16 x i32>, <16 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32> } [[RES3]] +; + %x2 = load <16 x i32>, ptr %x2p + %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 %x3) + %res2 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 + %res3 = insertvalue { <16 x i32>, <16 x i32> } %res2, <16 x i32> %res1, 1 + ret { <16 x i32>, <16 x i32> } %res3 +} + +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll new file mode 100644 index 0000000000000..ca07d5905c8af --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll @@ -0,0 +1,338 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mattr=+avx512vnni -passes=msan -S | FileCheck %s + +; Forked from llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll +; +; Handled strictly: (none) +; +; Handled heuristically: +; - llvm.x86.avx512.vpdpbusd.512 +; - llvm.x86.avx512.vpdpbusds.512 +; - llvm.x86.avx512.vpdpwssd.512 +; - llvm.x86.avx512.vpdpwssds.512 + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr 
@__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %1 +} + +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory { +; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; 
CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i32> [[TMP23]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT4:%.*]] = select <16 x i1> [[TMP20]], <16 x i32> [[TMP25]], <16 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP19]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES1:%.*]] = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP18]], 0 +; CHECK-NEXT: 
[[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT4]], 1 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[RES1]], <16 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <16 x i32>, <16 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32> } [[RES2]] +; + %x2 = load <16 x i32>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0 + %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4) + %5 = bitcast i16 %x3 to <16 x i1> + %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer + %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 + %res2 = insertvalue { <16 x i32>, <16 x i32> } %res1, <16 x i32> %6, 1 + ret { <16 x i32>, <16 x i32> } %res2 +} + +declare <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpbusds_512( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> 
[[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %1 +} + +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory { +; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: 
[[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i32> [[TMP23]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT4:%.*]] = select <16 x i1> [[TMP20]], <16 x i32> [[TMP25]], <16 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP19]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES1:%.*]] = insertvalue { <16 x i32>, <16 x 
i32> } poison, <16 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT4]], 1 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[RES1]], <16 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <16 x i32>, <16 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32> } [[RES2]] +; + %x2 = load <16 x i32>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0 + %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4) + %5 = bitcast i16 %x3 to <16 x i1> + %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer + %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 + %res2 = insertvalue { <16 x i32>, <16 x i32> } %res1, <16 x i32> %6, 1 + ret { <16 x i32>, <16 x i32> } %res2 +} + +declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpwssd_512( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> 
@llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %1 +} + +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory { +; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 +; CHECK-NEXT: 
[[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i32> [[TMP23]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT4:%.*]] = select <16 x i1> [[TMP20]], <16 x i32> [[TMP25]], <16 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP19]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 
0 +; CHECK-NEXT: [[RES1:%.*]] = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT4]], 1 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[RES1]], <16 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <16 x i32>, <16 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32> } [[RES2]] +; + %x2 = load <16 x i32>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0 + %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4) + %5 = bitcast i16 %x3 to <16 x i1> + %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer + %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 + %res2 = insertvalue { <16 x i32>, <16 x i32> } %res1, <16 x i32> %6, 1 + ret { <16 x i32>, <16 x i32> } %res2 +} + +declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_ask_vpdpwssds_512( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> 
[[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %1 +} + +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory { +; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: 
[[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], [[X0]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i32> [[TMP23]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT4:%.*]] = select <16 x i1> [[TMP20]], <16 x i32> [[TMP25]], <16 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP19]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32> } { <16 x i32> 
splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES1:%.*]] = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP18]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT4]], 1 +; CHECK-NEXT: [[RES2:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[RES1]], <16 x i32> [[TMP26]], 1 +; CHECK-NEXT: store { <16 x i32>, <16 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32> } [[RES2]] +; + %x2 = load <16 x i32>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0 + %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4) + %5 = bitcast i16 %x3 to <16 x i1> + %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer + %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 + %res2 = insertvalue { <16 x i32>, <16 x i32> } %res1, <16 x i32> %6, 1 + ret { <16 x i32>, <16 x i32> } %res2 +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll new file mode 100644 index 0000000000000..0af0a89f177ee --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mattr=+avx512vnni,+avx512vl,+avxvnni -passes=msan -S | FileCheck %s + +; Forked from llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll +; +; Handled strictly: (none) +; +; Handled heuristically: +; - llvm.x86.avx512.vpdpbusd.128 +; - llvm.x86.avx512.vpdpbusd.256 +; - llvm.x86.avx512.vpdpbusds.128 +; - llvm.x86.avx512.vpdpbusds.256 +; - llvm.x86.avx512.vpdpwssd.128 +; - llvm.x86.avx512.vpdpwssd.256 +; - llvm.x86.avx512.vpdpwssds.128 +; - llvm.x86.avx512.vpdpwssds.256 + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx_vpdpbusd_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> 
@llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx_vpdpbusd_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx_vpdpbusds_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr 
@__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx_vpdpbusds_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = 
call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx_vpdpwssd_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx_vpdpwssd_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx_vpdpwssds_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x 
i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx_vpdpwssds_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll new file mode 100644 index 0000000000000..66cbebee80dc3 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll @@ -0,0 +1,239 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mattr=+avxvnniint16 -passes=msan -S | FileCheck %s + +; Forked from llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll +; +; Handled strictly: (none) +; +; Handled heuristically: +; - llvm.x86.avx2.vpdpwsud.128 +; - llvm.x86.avx2.vpdpwsud.256 +; - llvm.x86.avx2.vpdpwsuds.128 +; - llvm.x86.avx2.vpdpwsuds.256 +; - llvm.x86.avx2.vpdpwusd.128 +; - llvm.x86.avx2.vpdpwusd.256 +; - 
llvm.x86.avx2.vpdpwusds.128 +; - llvm.x86.avx2.vpdpwusds.256 +; - llvm.x86.avx2.vpdpwuud.128 +; - llvm.x86.avx2.vpdpwuud.256 +; - llvm.x86.avx2.vpdpwuuds.128 +; - llvm.x86.avx2.vpdpwuuds.256 + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwsud_128( +; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RET]] +; + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwsud_256( +; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add 
(i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RET]] +; + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128( +; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RET]] +; + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} 
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256( +; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RET]] +; + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwusd_128( +; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RET]] +; + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwusd_256( +; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RET]] +; + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +; CHECK-LABEL: define <4 x i32> 
@test_int_x86_avx2_vpdpwusds_128( +; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RET]] +; + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwusds_256( +; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[A]], 
<8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RET]] +; + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwuud_128( +; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RET]] +; + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwuud_256( +; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, 
ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RET]] +; + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128( +; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RET]] +; + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + 
ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256( +; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RET]] +; + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll new file mode 100644 index 0000000000000..d586c314ed28c --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll @@ -0,0 +1,494 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mattr=+avxvnniint8 -passes=msan -S | FileCheck %s + +; Forked from llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll +; +; Handled strictly: (none) +; +; Handled heuristically: +; - 
llvm.x86.avx2.vpdpbssd.128 +; - llvm.x86.avx2.vpdpbssd.256 +; - llvm.x86.avx2.vpdpbssds.128 +; - llvm.x86.avx2.vpdpbssds.256 +; - llvm.x86.avx2.vpdpbsud.128 +; - llvm.x86.avx2.vpdpbsud.256 +; - llvm.x86.avx2.vpdpbsuds.128 +; - llvm.x86.avx2.vpdpbsuds.256 +; - llvm.x86.avx2.vpdpbuud.128 +; - llvm.x86.avx2.vpdpbuud.256 +; - llvm.x86.avx2.vpdpbuuds.128 +; - llvm.x86.avx2.vpdpbuuds.256 + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbssd_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: 
[[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] +; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %x2 = load <4 x i32>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %res = add <4 x i32> %1, %2 + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbssds_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; 
CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] +; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %x2 = load <4 x i32>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %res = add <4 x i32> %1, %2 + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbssd_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], 
<8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] +; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %x2 = load <8 x i32>, 
ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %res = add <8 x i32> %1, %2 + ret <8 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbssds_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[X0]], <8 x i32> 
[[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] +; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %x2 = load <8 x i32>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %res = add <8 x i32> %1, %2 + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbsud_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; 
CHECK: [[BB6]]: +; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] +; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %x2 = load <4 x i32>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %res = add <4 x i32> %1, %2 + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbsuds_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; 
CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] +; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %x2 = load <4 x i32>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %res = add <4 x i32> %1, %2 + ret <4 x i32> %res +} + +declare <8 x i32> 
@llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbsud_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> 
[[X4]]) +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] +; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %x2 = load <8 x i32>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %res = add <8 x i32> %1, %2 + ret <8 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbsuds_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: 
[[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] +; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %x2 = load <8 x i32>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %res = add <8 x i32> %1, %2 + ret <8 x i32> %res +} + +; test_int_x86_avx2_vpdpbuud_128: calls llvm.x86.avx2.vpdpbuud.128 twice with the same %x0 and %x1; the third operand is %x2 (loaded from %x2p) in the first call and %x4 in the second, and the two results are added. +; Expected instrumentation per the CHECK lines: the i64 shadow of %x2p is tested against zero, branching to __msan_warning_noreturn if set; each call's shadow is the OR of its three operand shadows; the OR of both is stored to __msan_retval_tls. +declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbuud_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; 
CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] +; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %x2 = load <4 x i32>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %res = add <4 x i32> %1, %2 + ret <4 x i32> %res +} + +; test_int_x86_avx2_vpdpbuuds_128: calls llvm.x86.avx2.vpdpbuuds.128 twice with the same %x0 and %x1; the third operand is %x2 (loaded from %x2p) in the first call and %x4 in the second, and the two results are added. +; Expected instrumentation per the CHECK lines: the i64 shadow of %x2p is tested against zero, branching to __msan_warning_noreturn if set; each call's shadow is the OR of its three operand shadows; the OR of both is stored to __msan_retval_tls. +declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbuuds_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], 
<4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] +; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %x2 = load <4 x i32>, 
ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %res = add <4 x i32> %1, %2 + ret <4 x i32> %res +} + +; test_int_x86_avx2_vpdpbuud_256: calls llvm.x86.avx2.vpdpbuud.256 twice with the same %x0 and %x1; the third operand is %x2 (loaded from %x2p) in the first call and %x4 in the second, and the two results are added. +; Expected instrumentation per the CHECK lines: the i64 shadow of %x2p is tested against zero, branching to __msan_warning_noreturn if set; each call's shadow is the OR of its three operand shadows; the OR of both is stored to __msan_retval_tls. +declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbuud_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> [[X0]], <8 x i32> 
[[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] +; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %x2 = load <8 x i32>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %res = add <8 x i32> %1, %2 + ret <8 x i32> %res +} + +; test_int_x86_avx2_vpdpbuuds_256: calls llvm.x86.avx2.vpdpbuuds.256 twice with the same %x0 and %x1; the third operand is %x2 (loaded from %x2p) in the first call and %x4 in the second, and the two results are added. +; Expected instrumentation per the CHECK lines: the i64 shadow of %x2p is tested against zero, branching to __msan_warning_noreturn if set; each call's shadow is the OR of its three operand shadows; the OR of both is stored to __msan_retval_tls. +declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbuuds_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; 
CHECK: [[BB6]]: +; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] +; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %x2 = load <8 x i32>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %res = add <8 x i32> %1, %2 + ret <8 x i32> %res +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;.