diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
new file mode 100644
index 0000000000000..f0b8f5b7988ca
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
@@ -0,0 +1,12090 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mattr=+avx512f -passes=msan -S | FileCheck %s
+
+; Forked from llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define <2 x double> @test_mask_compress_pd_128(<2 x double> %data, <2 x double> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_compress_pd_128(
+; CHECK-SAME: <2 x double> [[DATA:%.*]], <2 x double> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i2 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6:[0-9]+]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB10]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> [[DATA]], <2 x double> [[PASSTHRU]], <2 x i1> [[EXTRACT]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> %data, <2 x double> %passthru, <2 x i1> %extract)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_maskz_compress_pd_128(<2 x double> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_maskz_compress_pd_128(
+; CHECK-SAME: <2 x double> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP7]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i2 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK: [[BB7]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB8]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> [[DATA]], <2 x double> zeroinitializer, <2 x i1> [[EXTRACT]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> %data, <2 x double> zeroinitializer, <2 x i1> %extract)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_compress_pd_128(<2 x double> %data) #0 {
+; CHECK-LABEL: define <2 x double> @test_compress_pd_128(
+; CHECK-SAME: <2 x double> [[DATA:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> [[DATA]], <2 x double> undef, <2 x i1> splat (i1 true))
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP1]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> %data, <2 x double> undef, <2 x i1> <i1 true, i1 true>)
+  ret <2 x double> %1
+}
+
+define <4 x float> @test_mask_compress_ps_128(<4 x float> %data, <4 x float> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_compress_ps_128(
+; CHECK-SAME: <4 x float> [[DATA:%.*]], <4 x float> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK: [[BB9]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB10]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> [[DATA]], <4 x float> [[PASSTHRU]], <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> + %2 = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> %data, <4 x float> %passthru, <4 x i1> %extract) + ret <4 x float> %2 +} + +define <4 x float> @test_maskz_compress_ps_128(<4 x float> %data, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x float> @test_maskz_compress_ps_128( +; CHECK-SAME: <4 x float> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP7]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> [[DATA]], <4 x float> zeroinitializer, <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> + %2 = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> %data, <4 x float> zeroinitializer, <4 x i1> %extract) + ret <4 x float> %2 +} + +define <4 x float> @test_compress_ps_128(<4 x float> %data) #0 { +; CHECK-LABEL: define <4 x float> @test_compress_ps_128( +; CHECK-SAME: <4 x float> [[DATA:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> [[DATA]], <4 x float> undef, <4 x i1> splat (i1 true)) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> %data, <4 x float> undef, <4 x i1> ) + ret <4 x float> %1 +} + +define <2 x i64> @test_mask_compress_q_128(<2 x i64> %data, <2 x i64> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: define <2 x i64> @test_mask_compress_q_128( +; CHECK-SAME: <2 x i64> [[DATA:%.*]], <2 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP5:%.*]] = load 
i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP9]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i2 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB10]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> [[DATA]], <2 x i64> [[PASSTHRU]], <2 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> + %2 = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> %data, <2 x i64> %passthru, <2 x i1> %extract) + ret <2 x i64> %2 +} + +define <2 x i64> @test_maskz_compress_q_128(<2 x i64> %data, i8 %mask) #0 { +; +; CHECK-LABEL: define <2 x i64> @test_maskz_compress_q_128( +; CHECK-SAME: <2 x i64> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP7]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i2 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> [[DATA]], <2 x i64> zeroinitializer, <2 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> + %2 = 
call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> %data, <2 x i64> zeroinitializer, <2 x i1> %extract) + ret <2 x i64> %2 +} + +define <2 x i64> @test_compress_q_128(<2 x i64> %data) #0 { +; CHECK-LABEL: define <2 x i64> @test_compress_q_128( +; CHECK-SAME: <2 x i64> [[DATA:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> [[DATA]], <2 x i64> undef, <2 x i1> splat (i1 true)) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP1]] +; + %1 = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> %data, <2 x i64> undef, <2 x i1> ) + ret <2 x i64> %1 +} + +define <4 x i32> @test_mask_compress_d_128(<4 x i32> %data, <4 x i32> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_mask_compress_d_128( +; CHECK-SAME: <4 x i32> [[DATA:%.*]], <4 x i32> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP9]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB10]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> [[DATA]], <4 x i32> [[PASSTHRU]], <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> + %2 = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> %data, <4 x i32> %passthru, <4 x i1> %extract) + ret <4 x i32> %2 +} + +define <4 x i32> @test_maskz_compress_d_128(<4 x i32> %data, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_maskz_compress_d_128( +; CHECK-SAME: <4 x i32> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = 
bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP7]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> [[DATA]], <4 x i32> zeroinitializer, <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> + %2 = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> %data, <4 x i32> zeroinitializer, <4 x i1> %extract) + ret <4 x i32> %2 +} + +define <4 x i32> @test_compress_d_128(<4 x i32> %data) #0 { +; CHECK-LABEL: define <4 x i32> @test_compress_d_128( +; CHECK-SAME: <4 x i32> [[DATA:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> [[DATA]], <4 x i32> undef, <4 x i1> splat (i1 true)) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %1 = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> %data, <4 x i32> undef, <4 x i1> ) + ret <4 x i32> %1 +} + +define <2 x double> @test_expand_pd_128(<2 x double> %data) #0 { +; CHECK-LABEL: define <2 x double> @test_expand_pd_128( +; CHECK-SAME: <2 x double> [[DATA:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> [[DATA]], <2 x double> undef, <2 x i1> splat (i1 true)) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> %data, <2 x double> undef, <2 x i1> ) + ret <2 x double> %1 +} + +define <2 x double> @test_mask_expand_pd_128(<2 x double> %data, <2 x double> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: define <2 x double> @test_mask_expand_pd_128( +; CHECK-SAME: <2 x double> [[DATA:%.*]], <2 x double> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] 
to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP9]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i2 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB10]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> [[DATA]], <2 x double> [[PASSTHRU]], <2 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> + %2 = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> %data, <2 x double> %passthru, <2 x i1> %extract) + ret <2 x double> %2 +} + +define <2 x double> @test_maskz_expand_pd_128(<2 x double> %data, i8 %mask) #0 { +; +; CHECK-LABEL: define <2 x double> @test_maskz_expand_pd_128( +; CHECK-SAME: <2 x double> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP7]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i2 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> [[DATA]], <2 x double> zeroinitializer, <2 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> + %2 = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> %data, <2 x double> zeroinitializer, <2 x i1> %extract) + ret <2 x double> %2 +} + +define <4 x float> @test_expand_ps_128(<4 x float> %data) #0 { +; CHECK-LABEL: define <4 x float> @test_expand_ps_128( +; CHECK-SAME: <4 x float> [[DATA:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x 
i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> [[DATA]], <4 x float> undef, <4 x i1> splat (i1 true)) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> %data, <4 x float> undef, <4 x i1> ) + ret <4 x float> %1 +} + +define <4 x float> @test_mask_expand_ps_128(<4 x float> %data, <4 x float> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x float> @test_mask_expand_ps_128( +; CHECK-SAME: <4 x float> [[DATA:%.*]], <4 x float> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP9]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB10]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> [[DATA]], <4 x float> [[PASSTHRU]], <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> + %2 = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> %data, <4 x float> %passthru, <4 x i1> %extract) + ret <4 x float> %2 +} + +define <4 x float> @test_maskz_expand_ps_128(<4 x float> %data, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x float> @test_maskz_expand_ps_128( +; CHECK-SAME: <4 x float> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> +; CHECK-NEXT: 
[[TMP5:%.*]] = bitcast <4 x i32> [[TMP7]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> [[DATA]], <4 x float> zeroinitializer, <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> + %2 = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> %data, <4 x float> zeroinitializer, <4 x i1> %extract) + ret <4 x float> %2 +} + +define <2 x i64> @test_expand_q_128(<2 x i64> %data) #0 { +; CHECK-LABEL: define <2 x i64> @test_expand_q_128( +; CHECK-SAME: <2 x i64> [[DATA:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> [[DATA]], <2 x i64> undef, <2 x i1> splat (i1 true)) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP1]] +; + %1 = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> %data, <2 x i64> undef, <2 x i1> ) + ret <2 x i64> %1 +} + +define <2 x i64> @test_mask_expand_q_128(<2 x i64> %data, <2 x i64> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: define <2 x i64> @test_mask_expand_q_128( +; CHECK-SAME: <2 x i64> [[DATA:%.*]], <2 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP9]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i2 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB10]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> [[DATA]], <2 x i64> 
[[PASSTHRU]], <2 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> + %2 = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> %data, <2 x i64> %passthru, <2 x i1> %extract) + ret <2 x i64> %2 +} + +define <2 x i64> @test_maskz_expand_q_128(<2 x i64> %data, i8 %mask) #0 { +; +; CHECK-LABEL: define <2 x i64> @test_maskz_expand_q_128( +; CHECK-SAME: <2 x i64> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP7]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i2 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> [[DATA]], <2 x i64> zeroinitializer, <2 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> + %2 = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> %data, <2 x i64> zeroinitializer, <2 x i1> %extract) + ret <2 x i64> %2 +} + +define <4 x i32> @test_expand_d_128(<4 x i32> %data) #0 { +; CHECK-LABEL: define <4 x i32> @test_expand_d_128( +; CHECK-SAME: <4 x i32> [[DATA:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> [[DATA]], <4 x i32> undef, <4 x i1> splat (i1 true)) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %1 = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> %data, <4 x i32> undef, <4 x i1> ) + ret <4 x i32> %1 +} + +define <4 x i32> @test_mask_expand_d_128(<4 x i32> %data, <4 x i32> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_mask_expand_d_128( +; CHECK-SAME: <4 x i32> [[DATA:%.*]], <4 x i32> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call 
void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP9]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB10]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> [[DATA]], <4 x i32> [[PASSTHRU]], <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> + %2 = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> %data, <4 x i32> %passthru, <4 x i1> %extract) + ret <4 x i32> %2 +} + +define <4 x i32> @test_maskz_expand_d_128(<4 x i32> %data, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_maskz_expand_d_128( +; CHECK-SAME: <4 x i32> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP7]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> [[DATA]], <4 x i32> zeroinitializer, <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> + %2 = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> %data, <4 x i32> zeroinitializer, <4 x i1> %extract) + ret <4 x i32> %2 +} + +define <4 x double> @test_mask_compress_pd_256(<4 x double> %data, <4 x double> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x double> @test_mask_compress_pd_256( +; CHECK-SAME: <4 x double> 
[[DATA:%.*]], <4 x double> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP9]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB10]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> [[DATA]], <4 x double> [[PASSTHRU]], <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> + %2 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %data, <4 x double> %passthru, <4 x i1> %extract) + ret <4 x double> %2 +} + +define <4 x double> @test_maskz_compress_pd_256(<4 x double> %data, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x double> @test_maskz_compress_pd_256( +; CHECK-SAME: <4 x double> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP7]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> [[DATA]], <4 x double> zeroinitializer, <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr 
@__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> + %2 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %data, <4 x double> zeroinitializer, <4 x i1> %extract) + ret <4 x double> %2 +} + +define <4 x double> @test_compress_pd_256(<4 x double> %data) #0 { +; CHECK-LABEL: define <4 x double> @test_compress_pd_256( +; CHECK-SAME: <4 x double> [[DATA:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> [[DATA]], <4 x double> undef, <4 x i1> splat (i1 true)) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[TMP1]] +; + %1 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %data, <4 x double> undef, <4 x i1> ) + ret <4 x double> %1 +} + +define <8 x float> @test_mask_compress_ps_256(<8 x float> %data, <8 x float> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: define <8 x float> @test_mask_compress_ps_256( +; CHECK-SAME: <8 x float> [[DATA:%.*]], <8 x float> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP9]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB10]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> [[DATA]], <8 x float> [[PASSTHRU]], <8 x i1> [[TMP1]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> %data, <8 x float> %passthru, <8 x i1> %1) + ret <8 x float> %2 +} + +define <8 x float> @test_maskz_compress_ps_256(<8 x float> %data, i8 %mask) #0 { +; +; CHECK-LABEL: define <8 x float> @test_maskz_compress_ps_256( +; CHECK-SAME: <8 x float> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = 
bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP7]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> [[DATA]], <8 x float> zeroinitializer, <8 x i1> [[TMP1]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> %data, <8 x float> zeroinitializer, <8 x i1> %1) + ret <8 x float> %2 +} + +define <8 x float> @test_compress_ps_256(<8 x float> %data) #0 { +; CHECK-LABEL: define <8 x float> @test_compress_ps_256( +; CHECK-SAME: <8 x float> [[DATA:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> [[DATA]], <8 x float> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP1]] +; + %1 = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> %data, <8 x float> undef, <8 x i1> ) + ret <8 x float> %1 +} + +define <4 x i64> @test_mask_compress_q_256(<4 x i64> %data, <4 x i64> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x i64> @test_mask_compress_q_256( +; CHECK-SAME: <4 x i64> [[DATA:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP9]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB10]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i64> 
@llvm.x86.avx512.mask.compress.v4i64(<4 x i64> [[DATA]], <4 x i64> [[PASSTHRU]], <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> + %2 = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> %data, <4 x i64> %passthru, <4 x i1> %extract) + ret <4 x i64> %2 +} + +define <4 x i64> @test_maskz_compress_q_256(<4 x i64> %data, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x i64> @test_maskz_compress_q_256( +; CHECK-SAME: <4 x i64> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP7]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> [[DATA]], <4 x i64> zeroinitializer, <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> + %2 = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> %data, <4 x i64> zeroinitializer, <4 x i1> %extract) + ret <4 x i64> %2 +} + +define <4 x i64> @test_compress_q_256(<4 x i64> %data) #0 { +; CHECK-LABEL: define <4 x i64> @test_compress_q_256( +; CHECK-SAME: <4 x i64> [[DATA:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> [[DATA]], <4 x i64> undef, <4 x i1> splat (i1 true)) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; + %1 = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> %data, <4 x i64> undef, <4 x i1> ) + ret <4 x i64> %1 +} + +define <8 x i32> @test_mask_compress_d_256(<8 x i32> %data, <8 x i32> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: define <8 x i32> @test_mask_compress_d_256( +; CHECK-SAME: <8 x i32> [[DATA:%.*]], <8 x i32> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add 
(i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP9]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB10]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> [[DATA]], <8 x i32> [[PASSTHRU]], <8 x i1> [[TMP1]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> %data, <8 x i32> %passthru, <8 x i1> %1) + ret <8 x i32> %2 +} + +define <8 x i32> @test_maskz_compress_d_256(<8 x i32> %data, i8 %mask) #0 { +; +; CHECK-LABEL: define <8 x i32> @test_maskz_compress_d_256( +; CHECK-SAME: <8 x i32> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP7]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> [[DATA]], <8 x i32> zeroinitializer, <8 x i1> [[TMP1]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> %data, <8 x i32> zeroinitializer, <8 x i1> %1) + ret <8 x i32> %2 +} + +define <8 x i32> @test_compress_d_256(<8 x i32> %data) #0 { +; CHECK-LABEL: define <8 x i32> @test_compress_d_256( +; CHECK-SAME: <8 x i32> [[DATA:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> [[DATA]], <8 x i32> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; + %1 = call <8 
x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> %data, <8 x i32> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret <8 x i32> %1
+}
+
+define <4 x double> @test_expand_pd_256(<4 x double> %data) #0 {
+; CHECK-LABEL: define <4 x double> @test_expand_pd_256(
+; CHECK-SAME: <4 x double> [[DATA:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> [[DATA]], <4 x double> undef, <4 x i1> splat (i1 true))
+; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[TMP1]]
+;
+ %1 = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> %data, <4 x double> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret <4 x double> %1
+}
+
+define <4 x double> @test_mask_expand_pd_256(<4 x double> %data, <4 x double> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_expand_pd_256(
+; CHECK-SAME: <4 x double> [[DATA:%.*]], <4 x double> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB10]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> [[DATA]], <4 x double> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[TMP2]]
+;
+ %1 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> %data, <4 x double> %passthru, <4 x i1> %extract)
+ ret <4 x double> %2
+}
+
+define <4 x double> @test_maskz_expand_pd_256(<4 x double> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_maskz_expand_pd_256(
+; CHECK-SAME: <4 x double> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP7]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK: [[BB7]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB8]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> [[DATA]], <4 x double> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[TMP2]]
+;
+ %1 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> %data, <4 x double> zeroinitializer, <4 x i1> %extract)
+ ret <4 x double> %2
+}
+
+define <8 x float> @test_expand_ps_256(<8 x float> %data) #0 {
+; CHECK-LABEL: define <8 x float> @test_expand_ps_256(
+; CHECK-SAME: <8 x float> [[DATA:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> [[DATA]], <8 x float> undef, <8 x i1> splat (i1 true))
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[TMP1]]
+;
+ %1 = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> %data, <8 x float> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret <8 x float> %1
+}
+
+define <8 x float> @test_mask_expand_ps_256(<8 x float> %data, <8 x float> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_expand_ps_256(
+; CHECK-SAME: <8 x float> [[DATA:%.*]], <8 x float> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB10]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> [[DATA]], <8 x float> [[PASSTHRU]], <8 x i1> [[TMP1]])
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[TMP2]]
+;
+ %1 = bitcast i8 %mask to <8 x i1>
+ %2 = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> %data, <8 x float> %passthru, <8 x i1> %1)
+ ret <8 x float> %2
+}
+
+define <8 x float> @test_maskz_expand_ps_256(<8 x float> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_maskz_expand_ps_256(
+; CHECK-SAME: <8 x float> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP7]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK: [[BB7]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB8]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> [[DATA]], <8 x float> zeroinitializer, <8 x i1> [[TMP1]])
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[TMP2]]
+;
+ %1 = bitcast i8 %mask to <8 x i1>
+ %2 = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> %data, <8 x float> zeroinitializer, <8 x i1> %1)
+ ret <8 x float> %2
+}
+
+define <4 x i64> @test_expand_q_256(<4 x i64> %data) #0 {
+; CHECK-LABEL: define <4 x i64> @test_expand_q_256(
+; CHECK-SAME: <4 x i64> [[DATA:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> [[DATA]], <4 x i64> undef, <4 x i1> splat (i1 true))
+; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i64> [[TMP1]]
+;
+ %1 = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> %data, <4 x i64> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret <4 x i64> %1
+}
+
+define <4 x i64> @test_mask_expand_q_256(<4 x i64> %data, <4 x i64> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_mask_expand_q_256(
+; CHECK-SAME: <4 x i64> [[DATA:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB10]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> [[DATA]], <4 x i64> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i64> [[TMP2]]
+;
+ %1 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> %data, <4 x i64> %passthru, <4 x i1> %extract)
+ ret <4 x i64> %2
+}
+
+define <4 x i64> @test_maskz_expand_q_256(<4 x i64> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_maskz_expand_q_256(
+; CHECK-SAME: <4 x i64> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP7]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK: [[BB7]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB8]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> [[DATA]], <4 x i64> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i64> [[TMP2]]
+;
+ %1 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> %data, <4 x i64> zeroinitializer, <4 x i1> %extract)
+ ret <4 x i64> %2
+}
+
+define <8 x i32> @test_expand_d_256(<8 x i32> %data) #0 {
+; CHECK-LABEL: define <8 x i32> @test_expand_d_256(
+; CHECK-SAME: <8 x i32> [[DATA:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> [[DATA]], <8 x i32> undef, <8 x i1> splat (i1 true))
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[TMP1]]
+;
+ %1 = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> %data, <8 x i32> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret <8 x i32> %1
+}
+
+define <8 x i32> @test_mask_expand_d_256(<8 x i32> %data, <8 x i32> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_mask_expand_d_256(
+; CHECK-SAME: <8 x i32> [[DATA:%.*]], <8 x i32> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB10]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> [[DATA]], <8 x i32> [[PASSTHRU]], <8 x i1> [[TMP1]])
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[TMP2]]
+;
+ %1 = bitcast i8 %mask to <8 x i1>
+ %2 = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> %data, <8 x i32> %passthru, <8 x i1> %1)
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @test_maskz_expand_d_256(<8 x i32> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_maskz_expand_d_256(
+; CHECK-SAME: <8 x i32> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP7]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK: [[BB7]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB8]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> [[DATA]], <8 x i32> zeroinitializer, <8 x i1> [[TMP1]])
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[TMP2]]
+;
+ %1 = bitcast i8 %mask to <8 x i1>
+ %2 = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> %data, <8 x i32> zeroinitializer, <8 x i1> %1)
+ ret <8 x i32> %2
+}
+
+define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) #0 {
+; CHECK-LABEL: define i8 @test_cmpps_256(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP5]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[A]], <8 x float> [[B]], i32 2, <8 x i1> splat (i1 true))
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[RES]] to i8
+; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i8 [[TMP1]]
+;
+ %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ %1 = bitcast <8 x i1> %res to i8
+ ret i8 %1
+}
+declare <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, <8 x i1>)
+
+define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) #0 {
+; CHECK-LABEL: define i8 @test_cmpps_128(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP6]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[A]], <4 x float> [[B]], i32 2, <4 x i1> splat (i1 true))
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[RES]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
+; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i8 [[TMP2]]
+;
+ %res = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ %1 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+declare <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, <4 x i1>)
+
+define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) #0 {
+; CHECK-LABEL: define i8 @test_cmppd_256(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP6]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[A]], <4 x double> [[B]], i32 2, <4 x i1> splat (i1 true))
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[RES]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
+; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i8 [[TMP2]]
+;
+ %res = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ %1 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+declare <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32, <4 x i1>)
+
+define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) #0 {
+; CHECK-LABEL: define i8 @test_cmppd_128(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP6]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[A]], <2 x double> [[B]], i32 2, <2 x i1> splat (i1 true))
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i1> [[RES]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
+; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i8 [[TMP2]]
+;
+ %res = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2, <2 x i1> <i1 true, i1 true>)
+ %1 = shufflevector <2 x i1> %res, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+declare <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, <2 x i1>)
+
+define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mm512_maskz_max_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]],
<8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[A0]], <8 x float> [[A1]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i32> [[TMP5]], [[_MSPROP]] +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i32> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> [[TMP7]], <8 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP8]] +; + %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer + ret <8 x float> %3 +} + +define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) #0 { +; +; CHECK-LABEL: define <8 x float> @test_mm512_mask_max_ps_256( +; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[SRC:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[A0]], <8 x float> [[A1]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP]], <8 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x float> [[SRC]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i32> [[TMP6]], [[_MSPROP]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i32> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP13]], <8 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[SRC]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; 
CHECK-NEXT: ret <8 x float> [[TMP9]]
+;
+ %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %src
+ ret <8 x float> %3
+}
+
+define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: define <8 x float> @test_mm512_max_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT: store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[TMP1]]
+;
+ %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret <8 x float> %1
+}
+declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
+
+define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mm512_maskz_max_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP7]], <4 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> zeroinitializer
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP8]]
+;
+ %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer
+ ret <4 x float> %3
+}
+
+define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mm512_mask_max_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[SRC:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[SRC]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP13]], <4 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[SRC]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP9]]
+;
+ %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %src
+ ret <4 x float> %3
+}
+
+define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: define <4 x float> @test_mm512_max_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP1]]
+;
+ %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %1
+}
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
+
+define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mm512_maskz_min_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:
[[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[A0]], <8 x float> [[A1]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i32> [[TMP5]], [[_MSPROP]] +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i32> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> [[TMP7]], <8 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP8]] +; + %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer + ret <8 x float> %3 +} + +define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) #0 { +; +; CHECK-LABEL: define <8 x float> @test_mm512_mask_min_ps_256( +; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[SRC:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[A0]], <8 x float> [[A1]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP]], <8 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x float> [[SRC]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i32> [[TMP6]], [[_MSPROP]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i32> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP13]], <8 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[SRC]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP9]] +; + %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) + %2 = bitcast i8 %mask to <8 
x i1>
+ %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %src
+ ret <8 x float> %3
+}
+
+define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: define <8 x float> @test_mm512_min_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT: store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[TMP1]]
+;
+ %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret <8 x float> %1
+}
+declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
+
+define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mm512_maskz_min_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP7]], <4 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> zeroinitializer
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP8]]
+;
+ %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer
+ ret <4 x float> %3
+}
+
+define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mm512_mask_min_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[SRC:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[SRC]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP13]], <4 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[SRC]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP9]]
+;
+ %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %src
+ ret <4 x float> %3
+}
+
+define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: define <4 x float> @test_mm512_min_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP1]]
+;
+ %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %1
+}
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
+
+define <4 x double> @test_getexp_pd_256(<4 x double> %a0) #0 {
+; CHECK-LABEL: define <4 x double> @test_getexp_pd_256(
+; CHECK-SAME: <4 x double> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
+;
CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 -1) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[RES]] +; + %res = call <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1) + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone + +define <8 x float> @test_getexp_ps_256(<8 x float> %a0) #0 { +; CHECK-LABEL: define <8 x float> @test_getexp_ps_256( +; CHECK-SAME: <8 x float> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 -1) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[RES]] +; + %res = call <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone + +declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx512_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) #0 { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpermi2var_d_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; + %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %1 +} + +define <4 x i32>@test_int_x86_avx512_mask_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_vpermi2var_d_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; 
CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP7]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP9]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP2]], [[X1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i32> [[TMP5]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i32> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP2]], <4 x i32> [[X1]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP8]]
+;
+ %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x1
+ ret <4 x i32> %3
+}
+
+define <4 x i32>@test_int_x86_avx512_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpermt2var_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
+; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP2]]
+;
+ %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2)
+ ret <4 x i32> %1
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_vpermt2var_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP7]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP9]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP2]], [[X1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i32> [[TMP5]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i32> [[TMP10]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP2]], <4 x i32> [[X1]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP8]]
+;
+ %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x1
+ ret <4 x i32> %3
+}
+
+define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_vpermt2var_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP9]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP10]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP5]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP7]], <4 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP8]]
+;
+ %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
+ ret <4 x i32> %3
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx512_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) #0 {
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpermi2var_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[TMP2]]
+;
+ %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ ret <8 x i32> %1
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_vpermi2var_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP7]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP9]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i32> [[TMP2]], [[X1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i32> [[TMP5]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i32> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[TMP11]], <8 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[TMP2]], <8 x
i32> [[X1]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP8]] +; + %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1 + ret <8 x i32> %3 +} + +define <8 x i32>@test_int_x86_avx512_ask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) #0 { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_ask_vpermt2var_d_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]]) +; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) + ret <8 x i32> %1 +} + +define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_vpermt2var_d_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP9]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i32> [[TMP2]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i32> [[TMP5]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i32> [[TMP10]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[TMP11]], <8 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[TMP2]], <8 x i32> [[X1]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP8]] +; + %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, 
<8 x i32> %x1 + ret <8 x i32> %3 +} + +define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_maskz_vpermt2var_d_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i32> [[TMP5]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i32> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> [[TMP7]], <8 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[TMP2]], <8 x i32> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP8]] +; + %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer + ret <8 x i32> %3 +} + +declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>) + +define <2 x double>@test_int_x86_avx512_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) #0 { +; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_vpermi2var_pd_128( +; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x double> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP7]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[X0]], <2 x i64> [[X1]], <2 x double> [[X2]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[TMP3]] +; + %1 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) + ret <2 x double> %1 +} + +define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_vpermi2var_pd_128( +; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP11]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[X0]], <2 x i64> [[X1]], <2 x double> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[X1]] to <2 x double> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP13]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[TMP14:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> zeroinitializer, <2 x i64> [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x double> [[TMP3]] to <2 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = xor <2 x i64> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <2 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <2 x i64> [[TMP10]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP19]], <2 x i64> [[TMP14]] +; CHECK-NEXT: [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP3]], <2 x double> [[TMP4]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[TMP12]] +; + %1 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) + %2 = bitcast <2 x i64> %x1 to <2 x double> + %3 = bitcast i8 
%x3 to <8 x i1> + %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <2 x i32> <i32 0, i32 1> + %4 = select <2 x i1> %extract, <2 x double> %1, <2 x double> %2 + ret <2 x double> %4 +} + +declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>) + +define <4 x double>@test_int_x86_avx512_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) #0 { +; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_vpermi2var_pd_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP7]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[X0]], <4 x i64> [[X1]], <4 x double> [[X2]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[TMP3]] +; + %1 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) + ret <4 x double> %1 +} + +define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_vpermi2var_pd_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP15]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i64> [[TMP11]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[X0]], <4 x i64> [[X1]], <4 x double> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[X1]] to <4 x double> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP13]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x double> [[TMP3]] to <4 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i64> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <4 x i64> [[TMP10]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP19]], <4 x i64> [[TMP14]] +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP3]], <4 x double> [[TMP4]] +; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[TMP12]] +; + %1 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) + %2 = bitcast <4 x i64> %x1 to <4 x double> + %3 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %4 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %2 + ret <4 x double> %4 +} + +declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>) + +define <4 x float>@test_int_x86_avx512_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) #0 { +; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_vpermi2var_ps_128( +; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x float> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP7]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1]], <4 x float> [[X2]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP3]] +; + %1 = call <4 x float> 
@llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) + ret <4 x float> %1 +} + +define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_vpermi2var_ps_128( +; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP11]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1]], <4 x float> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <4 x float> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP13]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <4 x i32> [[TMP10]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP19]], <4 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP3]], <4 x float> [[TMP4]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP12]] +; + %1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) + %2 = bitcast <4 x i32> %x1 to <4 x float> + %3 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %2 + ret <4 x float> %4 +} + +define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> %x0, <2 x i64> %x1, <4 x float> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_vpermi2var_ps_128_cast( +; 
CHECK-SAME: <4 x float> [[X0:%.*]], <2 x i64> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[X1CAST:%.*]] = bitcast <2 x i64> [[X1]] to <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i32> [[TMP16]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP17]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x i32> [[TMP11]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP18]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB10]]: +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1CAST]], <4 x float> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1CAST]] to <4 x float> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP14]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP16]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i32> [[TMP10]], [[TMP16]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP20]], <4 x i32> [[TMP15]] +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP3]], <4 x float> [[TMP4]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP12]] +; + %x1cast = bitcast <2 x i64> %x1 to <4 x i32> + %1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1cast, <4 x float> %x2) + %2 = bitcast <4 x i32> %x1cast to <4 x float> + %3 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %2 + ret <4 x float> %4 +} + +declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>) + +define <8 x float>@test_int_x86_avx512_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) #0 { +; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_vpermi2var_ps_256( +; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> 
[[X1:%.*]], <8 x float> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP7]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[X0]], <8 x i32> [[X1]], <8 x float> [[X2]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP3]] +; + %1 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) + ret <8 x float> %1 +} + +define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_vpermi2var_ps_256( +; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP15]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i32> [[TMP11]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[X0]], <8 x i32> [[X1]], <8 x float> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <8 x float> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP13]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> zeroinitializer, <8 x i32> [[TMP2]] +; 
CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x float> [[TMP3]] to <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i32> [[TMP10]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> [[TMP19]], <8 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP5]], <8 x float> [[TMP3]], <8 x float> [[TMP4]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP12]] +; + %1 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) + %2 = bitcast <8 x i32> %x1 to <8 x float> + %3 = bitcast i8 %x3 to <8 x i1> + %4 = select <8 x i1> %3, <8 x float> %1, <8 x float> %2 + ret <8 x float> %4 +} + +declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>) + +define <2 x i64>@test_int_x86_avx512_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) #0 { +; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_vpermi2var_q_128( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]]) +; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; + %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) + ret <2 x i64> %1 +} + +define <2 x i64>@test_int_x86_avx512_mask_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_vpermi2var_q_128( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP9]] +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: 
[[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP1]], <2 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i64> [[TMP2]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <2 x i64> [[TMP5]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <2 x i64> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP2]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP2]], <2 x i64> [[X1]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP8]] +; + %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1> + %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x1 + ret <2 x i64> %3 +} + +define <2 x i64>@test_int_x86_avx512_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) #0 { +; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_vpermt2var_q_128( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]]) +; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; + %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) + ret <2 x i64> %1 +} + +define <2 x i64>@test_int_x86_avx512_mask_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_vpermt2var_q_128( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP9]] +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> 
[[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP1]], <2 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i64> [[TMP2]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <2 x i64> [[TMP5]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <2 x i64> [[TMP10]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP2]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP2]], <2 x i64> [[X1]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP8]] +; + %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1> + %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x1 + ret <2 x i64> %3 +} + +define <2 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_maskz_vpermt2var_q_128( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP1]], <2 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i64> [[TMP5]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP7:%.*]] = or <2 x i64> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP2]], <2 x i64> [[TMP7]], <2 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP2]], <2 x i64> zeroinitializer +; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP8]] +; + %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1> + %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> zeroinitializer + ret <2 x i64> %3 +} + +declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>) + +define <4 x i64>@test_int_x86_avx512_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) #0 { +; CHECK-LABEL: define <4 x i64> 
@test_int_x86_avx512_vpermi2var_q_256( +; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]]) +; CHECK-NEXT: store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP2]] +; + %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) + ret <4 x i64> %1 +} + +define <4 x i64>@test_int_x86_avx512_mask_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_vpermi2var_q_256( +; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP9]] +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP1]], <4 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i64> [[TMP2]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i64> [[TMP5]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i64> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i64> [[TMP11]], <4 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP2]], <4 x i64> [[X1]] +; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP8]] +; + %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x1 + ret <4 x i64> %3 +} + +define <4 x i64>@test_int_x86_avx512_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) #0 { +; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_vpermt2var_q_256( +; 
CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]]) +; CHECK-NEXT: store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP2]] +; + %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) + ret <4 x i64> %1 +} + +define <4 x i64>@test_int_x86_avx512_mask_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_vpermt2var_q_256( +; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP9]] +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP1]], <4 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i64> [[TMP2]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i64> [[TMP5]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i64> [[TMP10]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i64> [[TMP11]], <4 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP2]], <4 x i64> [[X1]] +; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP8]] +; + %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x1 + ret <4 x i64> %3 +} + +define <4 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_maskz_vpermt2var_q_256( +; CHECK-SAME: <4 x i64> 
[[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP1]], <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i64> [[TMP5]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i64> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i64> [[TMP7]], <4 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP2]], <4 x i64> zeroinitializer +; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP8]] +; + %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> zeroinitializer + ret <4 x i64> %3 +} + +declare <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_scalef_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2) #0 { +; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_scalef_pd_128( +; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: 
call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]], i8 -1) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + ret <2 x double> %res +} + +define <2 x double>@test_int_x86_avx512_mask_scalef_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_scalef_pd_128( +; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]], i8 [[X3]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) + ret <2 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_scalef_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2) #0 { +; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_scalef_pd_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; 
CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]], i8 -1) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[RES]] +; + %res = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_mask_scalef_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_scalef_pd_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]], i8 [[X3]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[RES]] +; + %res = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) + ret <4 x double> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_scalef_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) #0 { +; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_scalef_ps_128( +; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr 
@__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]], i8 -1) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_mask_scalef_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_scalef_ps_128( +; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]], i8 [[X3]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + ret <4 x 
float> %res +} + +declare <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_scalef_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2) #0 { +; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_scalef_ps_256( +; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]], i8 -1) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[RES]] +; + %res = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) + ret <8 x float> %res +} + +define <8 x float>@test_int_x86_avx512_mask_scalef_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_scalef_ps_256( +; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; 
CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]], i8 [[X3]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[RES]] +; + %res = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + ret <8 x float> %res +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmov_qb_128( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]] +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[RES4]] +; + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) 
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qb.mem.128(ptr %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qb_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qb_mem_128( +; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmov.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovs_qb_128( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], 
label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]] +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[RES4]] +; + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qb.mem.128(ptr %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qb_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qb_mem_128( +; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(ptr 
[[PTR]], <2 x i64> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovus_qb_128( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; 
CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]] +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[RES4]] +; + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qb.mem.128(ptr %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qb_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qb_mem_128( +; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmov_qb_256( +; CHECK-SAME: <4 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load 
i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]] +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[RES4]] +; + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qb.mem.256(ptr %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qb_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qb_mem_256( +; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne 
i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmov.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovs_qb_256( +; CHECK-SAME: <4 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]]) 
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]] +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[RES4]] +; + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qb.mem.256(ptr %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qb_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qb_mem_256( +; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64>, <16 x i8>, i8) + +define <16 x 
i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovus_qb_256( +; CHECK-SAME: <4 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]] +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[RES4]] +; + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qb.mem.256(ptr %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qb_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void 
@test_int_x86_avx512_mask_pmovus_qb_mem_256( +; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmov_qw_128( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: 
[[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]] +; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[RES4]] +; + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qw.mem.128(ptr %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qw_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qw_mem_128( +; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; 
CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmov.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovs_qw_128( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]] +; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[RES4]] +; + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> 
@llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qw.mem.128(ptr %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qw_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qw_mem_128( +; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovus_qw_128( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label 
%[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]] +; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[RES4]] +; + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qw.mem.128(ptr %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qw_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qw_mem_128( +; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(ptr 
[[PTR]], <2 x i64> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmov_qw_256( +; CHECK-SAME: <4 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: 
[[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES4]]
+;
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i16> %res0, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qw.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qw_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qw_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmov.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovs_qw_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB7]]:
+; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK: [[BB10]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB11]]:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK: [[BB13]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB14]]:
+; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES4]]
+;
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i16> %res0, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qw.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qw_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qw_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovus_qw_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB7]]:
+; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK: [[BB10]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB11]]:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK: [[BB13]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB14]]:
+; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES4]]
+;
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i16> %res0, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qw.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qw_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qw_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmov_qd_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB7]]:
+; CHECK-NEXT: [[RES0:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK: [[BB10]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB11]]:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK: [[BB13]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB14]]:
+; CHECK-NEXT: [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> [[X0]], <4 x i32> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <4 x i32> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <4 x i32> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[RES4]]
+;
+ %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
+ %res3 = add <4 x i32> %res0, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qd.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qd_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qd_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmov.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmovs_qd_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB7]]:
+; CHECK-NEXT: [[RES0:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK: [[BB10]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB11]]:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK: [[BB13]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB14]]:
+; CHECK-NEXT: [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> [[X0]], <4 x i32> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <4 x i32> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <4 x i32> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[RES4]]
+;
+ %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
+ %res3 = add <4 x i32> %res0, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qd.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qd_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qd_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmovus_qd_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB7]]:
+; CHECK-NEXT: [[RES0:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK: [[BB10]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB11]]:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK: [[BB13]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB14]]:
+; CHECK-NEXT: [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> [[X0]], <4 x i32> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <4 x i32> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <4 x i32> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[RES4]]
+;
+ %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
+ %res3 = add <4 x i32> %res0, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qd.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+ ret void
+}
+
+define <4 x i32>@test_int_x86_avx512_pmov_qd_256(<4 x i64> %x0) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_pmov_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <4 x i64> [[TMP2]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[X0]] to <4 x i32>
+; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP1]]
+;
+ %1 = trunc <4 x i64> %x0 to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmov_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <4 x i64> [[TMP6]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[X0]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP8]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X2]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[_MSPROP]], <4 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[X1]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP4]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i32> [[TMP9]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[TMP1]], <4 x i32> [[X1]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP7]]
+;
+ %1 = trunc <4 x i64> %x0 to <4 x i32>
+ %2 = bitcast i8 %x2 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract1, <4 x i32> %1, <4 x i32> %x1
+ ret <4 x i32> %3
+}
+
+define <4 x i32>@test_int_x86_avx512_maskz_pmov_qd_256(<4 x i64> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_pmov_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[X0]] to <4 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP8]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X2]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP4]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP6]], <4 x i32> [[TMP10]]
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TMP7]]
+;
+ %1 = trunc <4 x i64> %x0 to <4 x i32>
+ %2 = bitcast i8 %x2 to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
+ ret <4 x i32> %3
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qd.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qd_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qd_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmov.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_pmovs_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[RES]]
+;
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmovs_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB7]]:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[RES]]
+;
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
+ ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_maskz_pmovs_qd_256(<4 x i64> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_pmovs_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB5]]:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> [[X0]], <4 x i32> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[RES]]
+;
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
+ ret <4 x i32> %res
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qd.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_pmovus_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[RES]]
+;
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmovus_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB7]]:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[RES]]
+;
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
+ ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_maskz_pmovus_qd_256(<4 x i64> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_pmovus_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB5]]:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> [[X0]], <4 x i32> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[RES]]
+;
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
+ ret <4 x i32> %res
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qd.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qd_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qd_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmov_db_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB7]]:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK: [[BB10]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB11]]:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK: [[BB13]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB14]]:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.db.mem.128(ptr %ptr, <4 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_db_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_db_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmov.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovs_db_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB7]]:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK: [[BB10]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB11]]:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK: [[BB13]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB14]]:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.db.mem.128(ptr %ptr, <4 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_db_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_db_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovs.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovus_db_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB7]]:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK: [[BB10]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB11]]:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK: [[BB13]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB14]]:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.db.mem.128(ptr %ptr, <4 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_db_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_db_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovus.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmov_db_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB7]]:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK: [[BB10]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB11]]:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK: [[BB13]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB14]]:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.db.mem.256(ptr %ptr, <8 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_db_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_db_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmov.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovs_db_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB7]]:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:
[[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]] +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[RES4]] +; + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.db.mem.256(ptr %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovs_db_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_db_mem_256( +; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void 
@llvm.x86.avx512.mask.pmovs.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmovs.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovus_db_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]] +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[RES4]] +; + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 
x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.db.mem.256(ptr %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovus_db_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_db_mem_256( +; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmovus.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmov_dw_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; 
CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]] +; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[RES4]] +; + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.dw.mem.128(ptr %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmov_dw_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_dw_mem_128( +; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; 
CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmov.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovs_dw_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <8 x 
i16> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]] +; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[RES4]] +; + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.dw.mem.128(ptr %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovs_dw_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_dw_mem_128( +; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovus_dw_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast 
<4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]] +; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[RES4]] +; + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.dw.mem.128(ptr %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovus_dw_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_dw_mem_128( +; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: 
[[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmov_dw_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: 
[[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]] +; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[RES4]] +; + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.dw.mem.256(ptr %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmov_dw_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_dw_mem_256( +; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmov.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovs_dw_256( +; CHECK-SAME: <8 x i32> 
[[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]] +; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[RES4]] +; + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.dw.mem.256(ptr %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovs_dw_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_dw_mem_256( +; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = 
load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovus_dw_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: 
br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]] +; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[RES4]] +; + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.dw.mem.256(ptr %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovus_dw_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_dw_mem_256( +; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]]) +; 
CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2dq_128( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1) +; CHECK-NEXT: [[RES2:%.*]] = add <4 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES2]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +define <4 x i32>@test_int_x86_avx512_cvt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1) #0 { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_pd2dq_128_zext( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne 
i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1) +; CHECK-NEXT: [[RES3:%.*]] = shufflevector <4 x i32> [[RES2]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES3]] +; + %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x i32> %res3 +} + +define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2dq_128_zext( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[RES1:%.*]] = shufflevector <4 x i32> [[RES]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES1]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) + %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x i32> %res1 +} + +declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_cvt_pd2ps(<2 x double> %x0, <4 x float> %x1) #0 { +; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_cvt_pd2ps( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT:
[[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> [[X0]], <4 x float> [[X1]], i8 -1) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_cvt_pd2ps( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> [[X0]], <4 x float> [[X1]], i8 [[X2]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_cvt_pd2ps_zext(<2 x double> %x0, <4 x float> %x1) #0 { +; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_cvt_pd2ps_zext( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> [[X0]], <4 x float> [[X1]], i8 -1) +; CHECK-NEXT: [[RES3:%.*]] = shufflevector <4 x float> [[RES2]], <4 x float> 
zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES3]] +; + %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1) + %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x float> %res3 +} + +define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_zext(<2 x double> %x0, <4 x float> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_cvt_pd2ps_zext( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> [[X0]], <4 x float> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[RES1:%.*]] = shufflevector <4 x float> [[RES]], <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES1]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) + %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x float> %res1 +} + +declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1) #0 { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_pd2udq_128( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +;
CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2udq_128( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1) #0 { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_pd2udq_128_zext( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1) +; CHECK-NEXT: [[RES3:%.*]] = shufflevector <4 x i32> [[RES2]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES3]] +; + %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x i32> %res3 +} + +define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL:
define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2udq_128_zext( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[RES1:%.*]] = shufflevector <4 x i32> [[RES]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES1]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) + %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x i32> %res1 +} + +declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1) #0 { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_pd2udq_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> [[X0]], <4 x i32> [[X1]], i8 -1) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2udq_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls,
align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1) #0 { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_ps2dq_128( +; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 -1) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_ps2dq_128( +; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] 
= icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1) #0 { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_cvt_ps2dq_256( +; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 -1) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_cvt_ps2dq_256( +; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; 
CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1) #0 { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_ps2udq_128( +; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 -1) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_ps2udq_128( +; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> 
@llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1) #0 { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_cvt_ps2udq_256( +; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 -1) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_cvt_ps2udq_256( +; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_cvtt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1) #0 { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2dq_128( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32>
[[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2dq_128( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_cvtt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1) #0 { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2dq_128_zext( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; 
CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1) +; CHECK-NEXT: [[RES3:%.*]] = shufflevector <4 x i32> [[RES2]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES3]] +; + %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x i32> %res3 +} + +define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2dq_128_zext( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[RES1:%.*]] = shufflevector <4 x i32> [[RES]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES1]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) + %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x i32> %res1 +} + +declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1) #0 { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2udq_128( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]]
= icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2udq_128( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1) #0 { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2udq_128_zext( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1) +; CHECK-NEXT: [[RES3:%.*]] = 
shufflevector <4 x i32> [[RES2]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES3]] +; + %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x i32> %res3 +} + +define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2udq_128_zext( +; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: [[RES1:%.*]] = shufflevector <4 x i32> [[RES]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES1]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) + %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x i32> %res1 +} + +declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1) #0 { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2udq_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> [[X0]], <4 x i32> [[X1]], i8 -1) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr
@__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2udq_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1) #0 { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_ps2udq_128( +; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 -1) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_ps2udq_128( +; CHECK-SAME: <4 x 
float> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[RES]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1) #0 { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_cvtt_ps2udq_256( +; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 -1) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_cvtt_ps2udq_256( +; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to 
ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 [[X2]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +declare <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double>, i32, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask_rndscale_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_rndscale_pd_128( +; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> [[X0]], i32 4, <2 x double> [[X2]], i8 [[X3]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> [[X0]], i32 88, <2 x double> [[X2]], i8 -1) +; CHECK-NEXT: [[RES2:%.*]] = fadd <2 x double> [[RES]], [[RES1]] +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES2]] +; + %res = call <2 x double> 
@llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 4, <2 x double> %x2, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 88, <2 x double> %x2, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double>, i32, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask_rndscale_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_rndscale_pd_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> [[X0]], i32 4, <4 x double> [[X2]], i8 [[X3]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> [[X0]], i32 88, <4 x double> [[X2]], i8 -1) +; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x double> [[RES]], [[RES1]] +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[RES2]] +; + %res = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 4, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 88, <4 x double> %x2, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float>, i32, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_rndscale_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_rndscale_ps_128( +; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> [[X0]], i32 88, <4 x float> [[X2]], i8 [[X3]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> [[X0]], i32 4, <4 x float> [[X2]], i8 -1) +; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[RES]], [[RES1]] +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES2]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 88, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 4, <4 x float> %x2, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float>, i32, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_rndscale_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_rndscale_ps_256( +; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: 
[[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> [[X0]], i32 5, <8 x float> [[X2]], i8 [[X3]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> [[X0]], i32 66, <8 x float> [[X2]], i8 -1) +; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x float> [[RES]], [[RES1]] +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[RES2]] +; + %res = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 5, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 66, <8 x float> %x2, i8 -1) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double>, i32, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask_getmant_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_getmant_pd_128( +; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> [[X0]], i32 11, <2 x double> [[X2]], i8 [[X3]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB10]]: +; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> [[X0]], i32 12, <2 x double> zeroinitializer, i8 [[X3]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: 
[[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> [[X0]], i32 13, <2 x double> [[X2]], i8 -1) +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[RES2]], [[RES3]] +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %res = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> %x2, i8 %x3) + %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 12, <2 x double> zeroinitializer, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 13, <2 x double> %x2, i8 -1) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res2, %res3 + ret <2 x double> %res4 +} + +declare <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double>, i32, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask_getmant_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_getmant_pd_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> [[X0]], i32 11, <4 x double> [[X2]], i8 [[X3]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> [[X0]], i32 12, <4 x double> [[X2]], i8 -1) +; 
CHECK-NEXT: [[RES2:%.*]] = fadd <4 x double> [[RES]], [[RES1]] +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[RES2]] +; + %res = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %x0, i32 11, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %x0, i32 12, <4 x double> %x2, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float>, i32, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_getmant_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_getmant_ps_128( +; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> [[X0]], i32 11, <4 x float> [[X2]], i8 [[X3]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> [[X0]], i32 12, <4 x float> [[X2]], i8 -1) +; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[RES]], [[RES1]] +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES2]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %x0, i32 11, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %x0, i32 12, <4 x float> %x2, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float>, i32, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_getmant_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_getmant_ps_256( +; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { 
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> [[X0]], i32 11, <8 x float> [[X2]], i8 [[X3]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] +; CHECK: [[BB10]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> [[X0]], i32 12, <8 x float> [[X2]], i8 -1) +; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x float> [[RES]], [[RES1]] +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[RES2]] +; + %res = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %x0, i32 12, <8 x float> %x2, i8 -1) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32) + +define <4 x i32>@test_int_x86_avx512_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) #0 { +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_pternlog_d_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP7]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], 
[[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]], i32 33) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33) + ret <4 x i32> %1 +} + +define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pternlog_d_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP8]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP9]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP14]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]], i32 33) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[X0]] +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP16]], <4 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP1]], <4 x i32> [[X0]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP7]] +; + %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33) + %2 = bitcast i8 %x4 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0 + ret <4 x i32> %3 +} + +declare <4 x i32>
@llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32, i8) + +define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_pternlog_d_128( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[TMP8]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP13]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP9]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP14]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP10]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]], i32 33) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP12]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP6]], <4 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP7]] +; + %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33) + %2 = bitcast i8 %x4 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer + ret <4 x i32> %3 +} + +declare <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32) + +define <8 x i32>@test_int_x86_avx512_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) #0 { +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_pternlog_d_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64
ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP7]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]], i32 33) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; + %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33) + ret <8 x i32> %1 +} + +define <8 x i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_pternlog_d_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i32> [[TMP8]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP9]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP14]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]], i32 33) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> zeroinitializer, <8 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i32> [[TMP1]], [[X0]] +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> [[TMP16]], <8 x i32> [[TMP13]] 
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[X0]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP7]] +; + %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33) + %2 = bitcast i8 %x4 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0 + ret <8 x i32> %3 +} + +declare <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32, i8) + +define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_maskz_pternlog_d_256( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i32> [[TMP8]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP13]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i32> [[TMP9]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP14]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i32> [[TMP10]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]], i32 33) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP12]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> zeroinitializer, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> [[TMP6]], <8 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP7]] +; + %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33) + %2 = bitcast i8 %x4 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer + ret <8 x i32> %3 +} + +declare <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32) + +define <2 x i64>@test_int_x86_avx512_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) #0 { +; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_pternlog_q_128( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], 
<2 x i64> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP7]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]], i32 33) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP1]] +; + %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33) + ret <2 x i64> %1 +} + +define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_pternlog_q_128( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP8]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP9]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP14]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]], i32 33) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1> +; CHECK-NEXT:
[[TMP13:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> zeroinitializer, <2 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i64> [[TMP1]], [[X0]] +; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <2 x i64> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP16]], <2 x i64> [[TMP13]] +; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP1]], <2 x i64> [[X0]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP7]] +; + %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33) + %2 = bitcast i8 %x4 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1> + %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x0 + ret <2 x i64> %3 +} + +define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_maskz_pternlog_q_128( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[TMP8]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP13]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP9]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP14]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP10]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]], i32 33) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP12]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> zeroinitializer, <2 x i64> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP6]], <2 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP1]], <2 x i64> zeroinitializer +; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64>
[[TMP7]] +; + %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33) + %2 = bitcast i8 %x4 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1> + %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> zeroinitializer + ret <2 x i64> %3 +} + +declare <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32) + +define <4 x i64>@test_int_x86_avx512_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) #0 { +; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_pternlog_q_256( +; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP7]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]], i32 33) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; + %1 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33) + ret <4 x i64> %1 +} + +define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_pternlog_q_256( +; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i64> [[TMP8]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP9]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP14]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label
%[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]], i32 33) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], [[X0]] +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i64> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP16]], <4 x i64> [[TMP13]] +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[X0]] +; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP7]] +; + %1 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33) + %2 = bitcast i8 %x4 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x0 + ret <4 x i64> %3 +} + +define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_maskz_pternlog_q_256( +; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i64> [[TMP8]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP13]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i64> [[TMP9]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP14]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i64> [[TMP10]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]], i32 33) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP12]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT:
[[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> zeroinitializer, <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP6]], <4 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> zeroinitializer +; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP7]] +; + %1 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33) + %2 = bitcast i8 %x4 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> zeroinitializer + ret <4 x i64> %3 +} + +define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %src) #0 { +; +; CHECK-LABEL: define <8 x i16> @test_x86_vcvtps2ph_128( +; CHECK-SAME: <4 x float> [[A0:%.*]], i8 [[MASK:%.*]], <8 x i16> [[SRC:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> [[A0]], i32 2, <8 x i16> zeroinitializer, i8 -1) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP2]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> [[A0]], i32 10, <8 x i16> zeroinitializer, i8 [[MASK]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP2]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] +; CHECK-NEXT: br i1 [[_MSOR7]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]] +; CHECK: [[BB12]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB13]]: +; CHECK-NEXT: [[RES3:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> [[A0]], i32 11, <8 x i16> [[SRC]], i8 [[MASK]]) +; CHECK-NEXT: [[RES0:%.*]] = add <8 x i16> [[RES1]], [[RES2]] +; CHECK-NEXT: [[RES:%.*]] = add <8 x
i16> [[RES3]], [[RES0]] +; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[RES]] +; + %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 10, <8 x i16> zeroinitializer, i8 %mask) + %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 11, <8 x i16> %src, i8 %mask) + %res0 = add <8 x i16> %res1, %res2 + %res = add <8 x i16> %res3, %res0 + ret <8 x i16> %res +} + +declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float>, i32, <8 x i16>, i8) nounwind readonly + +define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %src) #0 { +; +; CHECK-LABEL: define <8 x i16> @test_x86_vcvtps2ph_256( +; CHECK-SAME: <8 x float> [[A0:%.*]], i8 [[MASK:%.*]], <8 x i16> [[SRC:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> [[A0]], i32 2, <8 x i16> zeroinitializer, i8 -1) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP2]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> [[A0]], i32 11, <8 x i16> zeroinitializer, i8 [[MASK]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP2]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] +; CHECK-NEXT: br i1 [[_MSOR7]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]] +; CHECK: [[BB12]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB13]]: +; CHECK-NEXT: [[RES3:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> [[A0]], i32 12, <8 x i16> [[SRC]], i8 [[MASK]]) +; CHECK-NEXT: [[RES0:%.*]] = add <8 x i16> [[RES1]], [[RES2]] +; CHECK-NEXT: [[RES:%.*]] = add <8 x i16> [[RES3]], [[RES0]] +; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[RES]] +; + %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> 
zeroinitializer, i8 -1) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 11, <8 x i16> zeroinitializer, i8 %mask) + %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 12, <8 x i16> %src, i8 %mask) + %res0 = add <8 x i16> %res1, %res2 + %res = add <8 x i16> %res3, %res0 + ret <8 x i16> %res +} + +declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float>, i32, <8 x i16>, i8) nounwind readonly + +define <8 x float> @test_rsqrt_ps_256_rr(<8 x float> %a0) #0 { +; CHECK-LABEL: define <8 x float> @test_rsqrt_ps_256_rr( +; CHECK-SAME: <8 x float> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 -1) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[RES]] +; + %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1) + ret <8 x float> %res +} + +define <8 x float> @test_rsqrt_ps_256_rrkz(<8 x float> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: define <8 x float> @test_rsqrt_ps_256_rrkz( +; CHECK-SAME: <8 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 [[MASK]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[RES]] +; + %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_rsqrt_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: define <8 x float> @test_rsqrt_ps_256_rrk( +; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne 
i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> [[A0]], <8 x float> [[A1]], i8 [[MASK]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[RES]] +; + %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask) + ret <8 x float> %res +} + +define <4 x float> @test_rsqrt_ps_128_rr(<4 x float> %a0) #0 { +; CHECK-LABEL: define <4 x float> @test_rsqrt_ps_128_rr( +; CHECK-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> [[A0]], <4 x float> zeroinitializer, i8 -1) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) + ret <4 x float> %res +} + +define <4 x float> @test_rsqrt_ps_128_rrkz(<4 x float> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x float> @test_rsqrt_ps_128_rrkz( +; CHECK-SAME: <4 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> [[A0]], <4 x float> zeroinitializer, i8 [[MASK]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_rsqrt_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x float> @test_rsqrt_ps_128_rrk( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, 
ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> [[A0]], <4 x float> [[A1]], i8 [[MASK]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone +declare <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float>, <4 x float>, i8) nounwind readnone + +define <8 x float> @test_rcp_ps_256_rr(<8 x float> %a0) #0 { +; CHECK-LABEL: define <8 x float> @test_rcp_ps_256_rr( +; CHECK-SAME: <8 x float> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 -1) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[RES]] +; + %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1) + ret <8 x float> %res +} + +define <8 x float> @test_rcp_ps_256_rrkz(<8 x float> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: define <8 x float> @test_rcp_ps_256_rrkz( +; CHECK-SAME: <8 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> 
@llvm.x86.avx512.rcp14.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 [[MASK]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[RES]] +; + %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_rcp_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: define <8 x float> @test_rcp_ps_256_rrk( +; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> [[A0]], <8 x float> [[A1]], i8 [[MASK]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[RES]] +; + %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask) + ret <8 x float> %res +} + +define <4 x float> @test_rcp_ps_128_rr(<4 x float> %a0) #0 { +; CHECK-LABEL: define <4 x float> @test_rcp_ps_128_rr( +; CHECK-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> [[A0]], <4 x float> zeroinitializer, i8 -1) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) + ret <4 x float> %res +} + +define <4 x float> @test_rcp_ps_128_rrkz(<4 x float> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x float> @test_rcp_ps_128_rrkz( +; CHECK-SAME: <4 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; 
CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> [[A0]], <4 x float> zeroinitializer, i8 [[MASK]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_rcp_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x float> @test_rcp_ps_128_rrk( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> [[A0]], <4 x float> [[A1]], i8 [[MASK]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone +declare <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x double> @test_rsqrt_pd_256_rr(<4 x double> %a0) #0 { +; CHECK-LABEL: define <4 x double> @test_rsqrt_pd_256_rr( +; CHECK-SAME: <4 x double> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 -1) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[RES]] +; + %res = call <4 x 
double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1) + ret <4 x double> %res +} + +define <4 x double> @test_rsqrt_pd_256_rrkz(<4 x double> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x double> @test_rsqrt_pd_256_rrkz( +; CHECK-SAME: <4 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 [[MASK]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[RES]] +; + %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask) + ret <4 x double> %res +} + +define <4 x double> @test_rsqrt_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x double> @test_rsqrt_pd_256_rrk( +; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> [[A0]], <4 x double> [[A1]], i8 [[MASK]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[RES]] +; + %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask) + ret <4 x double> %res +} + +define <2 x double> @test_rsqrt_pd_128_rr(<2 x double> %a0) #0 { +; CHECK-LABEL: define <2 x double> @test_rsqrt_pd_128_rr( +; CHECK-SAME: <2 x double> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; 
CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> [[A0]], <2 x double> zeroinitializer, i8 -1) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1) + ret <2 x double> %res +} + +define <2 x double> @test_rsqrt_pd_128_rrkz(<2 x double> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: define <2 x double> @test_rsqrt_pd_128_rrkz( +; CHECK-SAME: <2 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> [[A0]], <2 x double> zeroinitializer, i8 [[MASK]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask) + ret <2 x double> %res +} + +define <2 x double> @test_rsqrt_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: define <2 x double> @test_rsqrt_pd_128_rrk( +; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> [[A0]], <2 x double> [[A1]], i8 [[MASK]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> 
%a0, <2 x double> %a1, i8 %mask) + ret <2 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone +declare <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double>, <2 x double>, i8) nounwind readnone + +define <4 x double> @test_rcp_pd_256_rr(<4 x double> %a0) #0 { +; CHECK-LABEL: define <4 x double> @test_rcp_pd_256_rr( +; CHECK-SAME: <4 x double> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 -1) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[RES]] +; + %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1) + ret <4 x double> %res +} + +define <4 x double> @test_rcp_pd_256_rrkz(<4 x double> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x double> @test_rcp_pd_256_rrkz( +; CHECK-SAME: <4 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 [[MASK]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[RES]] +; + %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask) + ret <4 x double> %res +} + +define <4 x double> @test_rcp_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x double> @test_rcp_pd_256_rrk( +; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 
[[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> [[A0]], <4 x double> [[A1]], i8 [[MASK]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[RES]] +; + %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask) + ret <4 x double> %res +} + +define <2 x double> @test_rcp_pd_128_rr(<2 x double> %a0) #0 { +; CHECK-LABEL: define <2 x double> @test_rcp_pd_128_rr( +; CHECK-SAME: <2 x double> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> [[A0]], <2 x double> zeroinitializer, i8 -1) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1) + ret <2 x double> %res +} + +define <2 x double> @test_rcp_pd_128_rrkz(<2 x double> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: define <2 x double> @test_rcp_pd_128_rrkz( +; CHECK-SAME: <2 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> [[A0]], <2 x double> zeroinitializer, i8 [[MASK]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask) + ret <2 x double> %res +} + +define <2 x double> @test_rcp_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: define <2 x double> @test_rcp_pd_128_rrk( +; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), 
align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> [[A0]], <2 x double> [[A1]], i8 [[MASK]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask) + ret <2 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone +declare <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double>, <2 x double>, i8) nounwind readnone + +declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>) + +define <4 x double>@test_int_x86_avx512_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) #0 { +; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_permvar_df_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[X0]], <4 x i64> [[X1]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[TMP1]] +; + %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1) + ret <4 x double> %1 +} + +define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_permvar_df_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls 
to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i64> [[TMP8]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i64> [[TMP11]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[X0]], <4 x i64> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> zeroinitializer, <4 x i64> [[TMP13]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i64> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i64> [[TMP7]], [[TMP13]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP17]], <4 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x double> [[TMP1]], <4 x double> [[X2]] +; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[TMP9]] +; + %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract1, <4 x double> %1, <4 x double> %x2 + ret <4 x double> %3 +} + +define <4 x double>@test_int_x86_avx512_maskz_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_permvar_df_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i64> [[TMP10]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP13]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i64> [[TMP11]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP14]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[X0]], <4 x 
i64> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP12]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> zeroinitializer, <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i64> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP7]], <4 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x double> [[TMP1]], <4 x double> zeroinitializer +; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[TMP8]] +; + %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract1, <4 x double> %1, <4 x double> zeroinitializer + ret <4 x double> %3 +} + +declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) + +define <4 x i64>@test_int_x86_avx512_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) #0 { +; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_permvar_di_256( +; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]]) +; CHECK-NEXT: store <4 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; + %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1) + ret <4 x i64> %1 +} + +define <4 x i64>@test_int_x86_avx512_mask_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_permvar_di_256( +; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> [[_MSPROP]], <4 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], [[X2]] +; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i64> [[TMP4]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i64> [[TMP11]], <4 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> [[TMP1]], <4 x i64> [[X2]] +; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP7]] +; + %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract1, <4 x i64> %1, <4 x i64> %x2 + ret <4 x i64> %3 +} + +define <4 x i64>@test_int_x86_avx512_maskz_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_maskz_permvar_di_256( +; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> [[_MSPROP]], <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i64> [[TMP4]], [[_MSPROP]] +; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i64> [[TMP6]], <4 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> [[TMP1]], <4 x i64> zeroinitializer +; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP7]] +; + %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract1, <4 x i64> %1, <4 x i64> zeroinitializer + ret <4 x i64> %3 +} + +declare <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double>, <2 x double>, <2 x i64>, i32, i8) + +define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_fixupimm_pd_128( +; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { 
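+; NOTE: fixupimm takes an immediate control word, so each call in the fixupimm tests below is guarded by its own shadow check: the vector operands' shadows are bitcast to a wide integer and compared against 0, the i1 results are OR-ed together (along with the i8 mask's shadow when the mask is not the constant -1), and the instrumented function branches to __msan_warning_noreturn if any operand bit is uninitialized.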
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 5, i8 [[X4]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]] +; CHECK: [[BB12]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB13]]: +; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> [[X1]], <2 x i64> [[X2]], i32 4, i8 [[X4]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]] +; CHECK: [[BB17]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB18]]: +; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 3, i8 -1) +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[RES3]], [[RES2]] +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 
x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4) + %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 3, i8 -1) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res3, %res2 + ret <2 x double> %res4 +} + +declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double>, <2 x double>, <2 x i64>, i32, i8) + +define <2 x double>@test_int_x86_avx512_maskz_fixupimm_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_maskz_fixupimm_pd_128( +; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 5, i8 [[X4]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]] +; CHECK: [[BB12]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB13]]: +; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> zeroinitializer, i32 3, i8 [[X4]]) +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]] +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES3]] +; + %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4) + 
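; The zeroinitializer operand in the next call carries an all-zero shadow, so the check guarding %res1 (BB13 in the CHECK block above) only tests the shadows of %x0, %x1 and the i8 mask %x4. +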
%res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 3, i8 %x4) + ;%res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 -1) + %res3 = fadd <2 x double> %res, %res1 + ;%res4 = fadd <2 x double> %res3, %res2 + ret <2 x double> %res3 +} + +declare <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double>, <4 x double>, <4 x i64>, i32, i8) + +define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_fixupimm_pd_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x i64> [[X2]], i32 4, i8 [[X4]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i256 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]] +; CHECK: [[BB12]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB13]]: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> [[X1]], <4 x i64> [[X2]], i32 5, i8 [[X4]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i256 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i256 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; 
CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i256 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]] +; CHECK: [[BB17]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB18]]: +; CHECK-NEXT: [[RES2:%.*]] = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x i64> [[X2]], i32 3, i8 -1) +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x double> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x double> [[RES3]], [[RES2]] +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[RES4]] +; + %res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4) + %res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4) + %res2 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 3, i8 -1) + %res3 = fadd <4 x double> %res, %res1 + %res4 = fadd <4 x double> %res3, %res2 + ret <4 x double> %res4 +} + +declare <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double>, <4 x double>, <4 x i64>, i32, i8) + +define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_fixupimm_pd_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x i64> [[X2]], i32 5, i8 [[X4]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i256 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; 
CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]] +; CHECK: [[BB12]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB13]]: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x i64> zeroinitializer, i32 4, i8 [[X4]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i256 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i256 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i256 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]] +; CHECK: [[BB17]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB18]]: +; CHECK-NEXT: [[RES2:%.*]] = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x i64> [[X2]], i32 3, i8 -1) +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x double> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x double> [[RES3]], [[RES2]] +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[RES4]] +; + %res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4) + %res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4) + %res2 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 3, i8 -1) + %res3 = fadd <4 x double> %res, %res1 + %res4 = fadd <4 x double> %res3, %res2 + ret <4 x double> %res4 +} + +declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float>, <4 x float>, <4 x i32>, i32, i8) + +define <4 x float>@test_int_x86_avx512_mask_fixupimm_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_fixupimm_ps_128( +; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 
[[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 5, i8 [[X4]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]] +; CHECK: [[BB12]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB13]]: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> zeroinitializer, i32 6, i8 [[X4]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]] +; CHECK: [[BB17]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB18]]: +; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 7, i8 -1) +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[RES3]], [[RES2]] +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4) + %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 6, i8 %x4) + %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 7, i8 -1) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res3, %res2 + ret <4 x float> %res4 +} + +declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float>, <4 x float>, <4 x i32>, i32, i8) + +define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_maskz_fixupimm_ps_128( +; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: 
[[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 5, i8 [[X4]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]] +; CHECK: [[BB12]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB13]]: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> zeroinitializer, i32 6, i8 [[X4]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]] +; CHECK: [[BB17]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB18]]: +; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 7, i8 -1) +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[RES3]], [[RES2]] +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4) + %res1 = call <4 x float> 
@llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 6, i8 %x4) + %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 7, i8 -1) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res3, %res2 + ret <4 x float> %res4 +} + +declare <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float>, <8 x float>, <8 x i32>, i32, i8) + +define <8 x float>@test_int_x86_avx512_mask_fixupimm_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_fixupimm_ps_256( +; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> [[X2]], i32 5, i8 [[X4]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i256 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]] +; CHECK: [[BB12]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB13]]: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> zeroinitializer, i32 6, i8 [[X4]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i256 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i256 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i256 [[TMP16]], 0 
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT: br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK: [[BB17]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB18]]:
+; CHECK-NEXT: [[RES2:%.*]] = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> [[X2]], i32 7, i8 -1)
+; CHECK-NEXT: [[RES3:%.*]] = fadd <8 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x float> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[RES4]]
+;
+ %res = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 %x4)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> zeroinitializer, i32 6, i8 %x4)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 7, i8 -1)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res3, %res2
+ ret <8 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float>, <8 x float>, <8 x i32>, i32, i8)
+
+define <8 x float>@test_int_x86_avx512_maskz_fixupimm_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_maskz_fixupimm_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i256 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK: [[BB12]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB13]]:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> zeroinitializer, i32 6, i8 [[X4]])
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i256 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT: br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK: [[BB17]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB18]]:
+; CHECK-NEXT: [[RES2:%.*]] = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> [[X2]], i32 7, i8 -1)
+; CHECK-NEXT: [[RES3:%.*]] = fadd <8 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x float> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[RES4]]
+;
+ %res = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 %x4)
+ %res1 = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> zeroinitializer, i32 6, i8 %x4)
+ %res2 = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 7, i8 -1)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res3, %res2
+ ret <8 x float> %res4
+}
+
+define <2 x i64> @test_x86_avx512_psra_q_128(<2 x i64> %a0, <2 x i64> %a1) #0 {
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_psra_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i128
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i128 [[TMP6]] to <2 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[RES]]
+;
+ %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
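+; The masked and maskz shift tests below share one instrumentation shape: the
+; mask and its shadow are narrowed by the same shufflevector, the result shadow
+; is selected lane-wise between the two arms' shadows, and any lane whose mask
+; bit is itself uninitialized falls back to the conservative xor/or combination
+; of the result, the passthrough, and both shadows.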
+define <2 x i64> @test_x86_avx512_mask_psra_q_128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_mask_psra_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i128
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i128 [[TMP8]] to <2 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP11:%.*]] = or <2 x i64> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> splat (i1 true), <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP11]], <2 x i64> [[TMP14]]
+; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i64> [[RES]], [[PASSTHRU]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <2 x i64> [[TMP4]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <2 x i64> [[TMP15]], [[TMP14]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP16]], <2 x i64> [[TMP13]]
+; CHECK-NEXT: [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[RES2]]
+;
+ %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %passthru
+ ret <2 x i64> %res2
+}
+define <2 x i64> @test_x86_avx512_maskz_psra_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_maskz_psra_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP15]] to i128
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i128 [[TMP7]] to <2 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP10:%.*]] = or <2 x i64> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> splat (i1 true), <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP10]], <2 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i64> [[TMP4]], [[TMP10]]
+; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP6]], <2 x i64> [[TMP12]]
+; CHECK-NEXT: [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> zeroinitializer
+; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[RES2]]
+;
+ %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer
+ ret <2 x i64> %res2
+}
+declare <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx512_psra_q_256(<4 x i64> %a0, <2 x i64> %a1) #0 {
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_psra_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i256
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i256 [[TMP6]] to <4 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT: store <4 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i64> [[RES]]
+;
+ %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_x86_avx512_mask_psra_q_256(<4 x i64> %a0, <2 x i64> %a1, <4 x i64> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_mask_psra_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i256
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i256 [[TMP8]] to <4 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i64> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP11]], <4 x i64> [[TMP14]]
+; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i64> [[RES]], [[PASSTHRU]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i64> [[TMP4]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i64> [[TMP15]], [[TMP14]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP16]], <4 x i64> [[TMP13]]
+; CHECK-NEXT: [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i64> [[RES2]]
+;
+ %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %passthru
+ ret <4 x i64> %res2
+}
+define <4 x i64> @test_x86_avx512_maskz_psra_q_256(<4 x i64> %a0, <2 x i64> %a1, <4 x i64> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_maskz_psra_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP15]] to i256
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i256 [[TMP7]] to <4 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i64> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP10]], <4 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i64> [[TMP4]], [[TMP10]]
+; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP6]], <4 x i64> [[TMP12]]
+; CHECK-NEXT: [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> zeroinitializer
+; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i64> [[RES2]]
+;
+ %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer
+ ret <4 x i64> %res2
+}
+declare <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64>, <2 x i64>) nounwind readnone
+
+
+define <2 x i64> @test_x86_avx512_psrai_q_128(<2 x i64> %a0) #0 {
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_psrai_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[TMP3]], i32 7)
+; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[A0]], i32 7)
+; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[RES]]
+;
+ %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+define <2 x i64> @test_x86_avx512_mask_psrai_q_128(<2 x i64> %a0, <2 x i64> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_mask_psrai_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[TMP5]], i32 7)
+; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[A0]], i32 7)
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP8]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> splat (i1 true), <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i64> [[RES]], [[PASSTHRU]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <2 x i64> [[TMP9]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP10]], <2 x i64> [[TMP7]]
+; CHECK-NEXT: [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[RES2]]
+;
+ %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %passthru
+ ret <2 x i64> %res2
+}
+define <2 x i64> @test_x86_avx512_maskz_psrai_q_128(<2 x i64> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_maskz_psrai_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[TMP3]], i32 7)
+; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[A0]], i32 7)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP7]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP8]], <8 x i1> splat (i1 true), <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP2]], <2 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP6]], <2 x i64> [[TMP9]]
+; CHECK-NEXT: [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> zeroinitializer
+; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[RES2]]
+;
+ %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer
+ ret <2 x i64> %res2
+}
+declare <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx512_psrai_q_256(<4 x i64> %a0) #0 {
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_psrai_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[TMP3]], i32 7)
+; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[A0]], i32 7)
+; CHECK-NEXT: store <4 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i64> [[RES]]
+;
+ %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_x86_avx512_mask_psrai_q_256(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_mask_psrai_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[TMP5]], i32 7)
+; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[A0]], i32 7)
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP8]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP2]], <4 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i64> [[RES]], [[PASSTHRU]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i64> [[TMP9]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP10]], <4 x i64> [[TMP7]]
+; CHECK-NEXT: [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i64> [[RES2]]
+;
+ %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %passthru
+ ret <4 x i64> %res2
+}
+define <4 x i64> @test_x86_avx512_maskz_psrai_q_256(<4 x i64> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_maskz_psrai_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[TMP3]], i32 7)
+; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[A0]], i32 7)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP7]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP8]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP2]], <4 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP6]], <4 x i64> [[TMP9]]
+; CHECK-NEXT: [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> zeroinitializer
+; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i64> [[RES2]]
+;
+ %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer
+ ret <4 x i64> %res2
+}
+declare <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64>, i32) nounwind readnone
+
+define <2 x i64> @test_x86_avx512_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1) #0 {
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_psrav_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i64> [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[RES]]
+;
+ %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_x86_avx512_mask_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_mask_psrav_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i64> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> splat (i1 true), <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP8]], <2 x i64> [[TMP11]]
+; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i64> [[RES]], [[A2]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <2 x i64> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <2 x i64> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP13]], <2 x i64> [[TMP10]]
+; CHECK-NEXT: [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> [[A2]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[RES2]]
+;
+ %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %a2
+ ret <2 x i64> %res2
+}
+
+define <2 x i64> @test_x86_avx512_maskz_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_maskz_psrav_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP7:%.*]] = or <2 x i64> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP8]], <8 x i1> splat (i1 true), <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP7]], <2 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i64> [[TMP4]], [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP6]], <2 x i64> [[TMP9]]
+; CHECK-NEXT: [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> zeroinitializer
+; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[RES2]]
+;
+ %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer
+ ret <2 x i64> %res2
+}
+
+declare <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64>, <2 x i64>) nounwind readnone
+
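+; The 256-bit psrav tests below follow the same per-element rule as the
+; 128-bit ones above: each lane of the variable shift amount whose shadow is
+; nonzero (icmp ne + sext) fully poisons the corresponding result lane, which
+; is then OR'ed with the shifted shadow of the first operand.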
+define <4 x i64> @test_x86_avx512_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1) #0 {
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_psrav_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <4 x i64> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[TMP1]], <4 x i64> [[A1]])
+; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i64> [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[A0]], <4 x i64> [[A1]])
+; CHECK-NEXT: store <4 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i64> [[RES]]
+;
+ %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_x86_avx512_mask_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_mask_psrav_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <4 x i64> [[A1:%.*]], <4 x i64> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[TMP1]], <4 x i64> [[A1]])
+; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i64> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[A0]], <4 x i64> [[A1]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP8]], <4 x i64> [[TMP11]]
+; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i64> [[RES]], [[A2]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i64> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP13]], <4 x i64> [[TMP10]]
+; CHECK-NEXT: [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> [[A2]]
+; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i64> [[RES2]]
+;
+ %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %a2
+ ret <4 x i64> %res2
+}
+
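+; maskz variant: the passthrough is zeroinitializer, so both the value select
+; and the shadow select fall back to zero for lanes the mask disables.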
+define <4 x i64> @test_x86_avx512_maskz_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_maskz_psrav_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <4 x i64> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[TMP1]], <4 x i64> [[A1]])
+; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i64> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[A0]], <4 x i64> [[A1]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP8]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP7]], <4 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i64> [[TMP4]], [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP6]], <4 x i64> [[TMP9]]
+; CHECK-NEXT: [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> zeroinitializer
+; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i64> [[RES2]]
+;
+ %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer
+ ret <4 x i64> %res2
+}
+
+declare <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <8 x float> @test_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-LABEL: define <8 x float> @test_vfmadd256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[A1]], <8 x float> [[A2]])
+; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[TMP1]]
+;
+ %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ ret <8 x float> %1
+}
+
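+; For the FMA tests, the shadow of llvm.fma is approximated as the bitwise OR
+; of all three operand shadows; the masked variants then apply the usual
+; mask/shadow select on top of that combined shadow.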
+define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_vfmadd256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[A1]], <8 x float> [[A2]])
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x float> [[A0]] to <8 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i32> [[TMP12]], [[TMP7]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP13]], <8 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[A0]]
+; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[TMP9]]
+;
+ %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %a0
+ ret <8 x float> %3
+}
+
+define <4 x float> @test_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: define <4 x float> @test_vfmadd128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP1]]
+;
+ %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %1
+}
+
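+; The 128-bit masked FMA test only uses the low four bits of the i8 mask, so
+; the mask and its shadow are both narrowed with the same <4 x i32>
+; shufflevector before the final selects.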
+define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i32> [[TMP12]], [[TMP7]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP13]], <4 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[A0]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP9]]
+;
+ %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %a0
+ ret <4 x float> %3
+}
+
+define <4 x double> @test_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) #0 {
+; CHECK-LABEL: define <4 x double> @test_fmadd256_pd(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x double> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A]], <4 x double> [[B]], <4 x double> [[C]])
+; CHECK-NEXT: store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[TMP1]]
+;
+ %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c)
+ ret <4 x double> %1
+}
+
+define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_fmadd256_pd(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x double> [[C:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP7]], [[TMP10]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A]], <4 x double> [[B]], <4 x double> [[C]])
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP1]], <4 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x double> [[A]] to <4 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP12]], [[TMP7]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i64> [[TMP13]], <4 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[A]]
+; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[TMP9]]
+;
+ %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %a
+ ret <4 x double> %3
+}
+
+define <2 x double> @test_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
+; CHECK-LABEL: define <2 x double> @test_fmadd128_pd(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]])
+; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP1]]
+;
+ %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+ ret <2 x double> %1
+}
+
+define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_fmadd128_pd(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], [[TMP10]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]])
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP1]], <2 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <2 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <2 x i64> [[TMP12]], [[TMP7]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP2]], <2 x i64> [[TMP13]], <2 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> [[A]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP9]]
+;
+ %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+ %3 = select <2 x i1> %extract, <2 x double> %1, <2 x double> %a
+ ret <2 x double> %3
+}
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfmadd_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], [[TMP10]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP1]], <2 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x double> [[X2]] to <2 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <2 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <2 x i64> [[TMP12]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP2]], <2 x i64> [[TMP13]], <2 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> [[X2]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP9]]
+;
+ %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+ %3 = select <2 x i1> %extract, <2 x double> %1, <2 x double> %x2
+ ret <2 x double> %3
+}
+
+define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_maskz_vfmadd_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP1]], <2 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i64> [[TMP5]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = or <2 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP2]], <2 x i64> [[TMP7]], <2 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> zeroinitializer
+; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP8]]
[[TMP8]] +; + %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> + %3 = select <2 x i1> %extract, <2 x double> %1, <2 x double> zeroinitializer + ret <2 x double> %3 +} + +define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmadd_pd_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP1]], <4 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i64> [[TMP6]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP12]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i64> [[TMP13]], <4 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[X2]] +; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[TMP9]] +; + %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %x2 + ret <4 x double> %3 +} + +define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_vfmadd_pd_256( +; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 
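+; Annotation: in the masked FMA tests the shadow of the FMA result is the OR
+; of the three operand shadows; the select blends result and passthrough
+; shadows per lane, and _MSPROP_SELECT conservatively poisons lanes whose
+; mask bit is itself uninitialized.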
+define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmadd_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP7]], [[TMP10]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP1]], <4 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP12]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i64> [[TMP13]], <4 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[X2]]
+; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[TMP9]]
+;
+ %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %x2
+ ret <4 x double> %3
+}
+
+define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_vfmadd_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i64> [[TMP5]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i64> [[TMP7]], <4 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> zeroinitializer
+; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[TMP8]]
+;
+ %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> zeroinitializer
+ ret <4 x double> %3
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[X2]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP13]], <4 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[X2]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP9]]
+;
+ %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %x2
+ ret <4 x float> %3
+}
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP5]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP7]], <4 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> zeroinitializer
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP8]]
+;
+ %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer
+ ret <4 x float> %3
+}
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfmadd_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x float> [[X2]] to <8 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP13]], <8 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[X2]]
+; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[TMP9]]
+;
+ %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %x2
+ ret <8 x float> %3
+}
+
+define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_maskz_vfmadd_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i32> [[TMP5]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP7]], <8 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> zeroinitializer
+; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[TMP8]]
+;
+ %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer
+ ret <8 x float> %3
+}
+
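+; Annotation: the vfmsub tests below express a*b-c as fma(a, b, -c). The
+; negation via "fsub -0.0, c" has a fully initialized constant operand, so
+; c's shadow is merely OR'ed with zeroinitializer and flows through the
+; negation unchanged.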
+define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfmsub_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <2 x i64> [[_MSPROP1]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[TMP1]])
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP2]], <2 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x double> [[TMP2]] to <2 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x double> [[X2]] to <2 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <2 x i64> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i64> [[TMP13]], [[TMP8]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP3]], <2 x i64> [[TMP14]], <2 x i64> [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP2]], <2 x double> [[X2]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP10]]
+;
+ %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
+ %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %1)
+ %3 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <2 x i32> <i32 0, i32 1>
+ %4 = select <2 x i1> %extract, <2 x double> %2, <2 x double> %x2
+ ret <2 x double> %4
+}
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmsub_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i64> [[_MSPROP1]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[TMP1]])
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP2]], <4 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x double> [[TMP2]] to <4 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i64> [[TMP13]], [[TMP8]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i64> [[TMP14]], <4 x i64> [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP2]], <4 x double> [[X2]]
+; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[TMP10]]
+;
+ %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
+ %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %1)
+ %3 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %4 = select <4 x i1> %extract, <4 x double> %2, <4 x double> %x2
+ ret <4 x double> %4
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP2]], <4 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[X2]] to <4 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i32> [[TMP13]], [[TMP8]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP14]], <4 x i32> [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP2]], <4 x float> [[X2]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP10]]
+;
+ %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+ %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %1)
+ %3 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %4 = select <4 x i1> %extract, <4 x float> %2, <4 x float> %x2
+ ret <4 x float> %4
+}
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfmsub_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[_MSPROP2]], <8 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x float> [[TMP2]] to <8 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x float> [[X2]] to <8 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i32> [[TMP13]], [[TMP8]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP15]], <8 x i32> [[TMP14]], <8 x i32> [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP3]], <8 x float> [[TMP2]], <8 x float> [[X2]]
+; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[TMP10]]
+;
+ %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+ %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %1)
+ %3 = bitcast i8 %x3 to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %x2
+ ret <8 x float> %4
+}
+
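+; Annotation: the vfnmadd tests negate the second multiplicand instead of the
+; addend; shadow propagation is identical, with the result shadow still the
+; OR of the three parameter shadows.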
+define <8 x float> @test_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-LABEL: define <8 x float> @test_vfnmadd256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[TMP1]], <8 x float> [[A2]])
+; CHECK-NEXT: store <8 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[TMP2]]
+;
+ %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+ %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %1, <8 x float> %a2)
+ ret <8 x float> %2
+}
+
+define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_vfnmadd256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP1]], [[TMP12]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[TMP1]], <8 x float> [[A2]])
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[_MSPROP2]], <8 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x float> [[TMP2]] to <8 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x float> [[A0]] to <8 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i32> [[TMP13]], [[TMP11]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP15]], <8 x i32> [[TMP14]], <8 x i32> [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP3]], <8 x float> [[TMP2]], <8 x float> [[A0]]
+; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[TMP10]]
+;
+ %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+ %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %1, <8 x float> %a2)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %a0
+ ret <8 x float> %4
+}
+
+define <4 x float> @test_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: define <4 x float> @test_vfnmadd128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[TMP1]], <4 x float> [[A2]])
+; CHECK-NEXT: store <4 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP2]]
+;
+ %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+ %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %1, <4 x float> %a2)
+ ret <4 x float> %2
+}
+
+define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfnmadd128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP12]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[TMP1]], <4 x float> [[A2]])
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP2]], <4 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i32> [[TMP13]], [[TMP11]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP14]], <4 x i32> [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP2]], <4 x float> [[A0]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP10]]
+;
+ %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+ %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %1, <4 x float> %a2)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %4 = select <4 x i1> %extract, <4 x float> %2, <4 x float> %a0
+ ret <4 x float> %4
+}
+
+define <4 x double> @test_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-LABEL: define <4 x double> @test_vfnmadd256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[TMP1]], <4 x double> [[A2]])
+; CHECK-NEXT: store <4 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[TMP2]]
+;
+ %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
+ %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %1, <4 x double> %a2)
+ ret <4 x double> %2
+}
+
+define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_vfnmadd256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP12]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[TMP1]], <4 x double> [[A2]])
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP2]], <4 x i64> [[TMP11]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x double> [[TMP2]] to <4 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x double> [[A0]] to <4 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i64> [[TMP13]], [[TMP11]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i64> [[TMP14]], <4 x i64> [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP2]], <4 x double> [[A0]]
+; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[TMP10]]
+;
+ %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
+ %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %1, <4 x double> %a2)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %4 = select <4 x i1> %extract, <4 x double> %2, <4 x double> %a0
+ ret <4 x double> %4
+}
+
+define <2 x double> @test_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: define <2 x double> @test_vfnmadd128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[TMP1]], <2 x double> [[A2]])
+; CHECK-NEXT: store <2 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP2]]
+;
+ %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a1
+ %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %1, <2 x double> %a2)
+ ret <2 x double> %2
+}
+
+define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_vfnmadd128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP12]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[TMP1]], <2 x double> [[A2]])
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP2]], <2 x i64> [[TMP11]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x double> [[TMP2]] to <2 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x double> [[A0]] to <2 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <2 x i64> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i64> [[TMP13]], [[TMP11]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP3]], <2 x i64> [[TMP14]], <2 x i64> [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP2]], <2 x double> [[A0]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP10]]
+;
+ %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a1
+ %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %1, <2 x double> %a2)
+ %3 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <2 x i32> <i32 0, i32 1>
+ %4 = select <2 x i1> %extract, <2 x double> %2, <2 x double> %a0
+ ret <2 x double> %4
+}
+
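+; Annotation: the vfnmsub tests negate both the second multiplicand and the
+; addend; each negation ORs the corresponding operand shadow with
+; zeroinitializer before the usual FMA shadow OR.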
+define <8 x float> @test_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-LABEL: define <8 x float> @test_vfnmsub256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A2]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[TMP1]], <8 x float> [[TMP2]])
+; CHECK-NEXT: store <8 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[TMP3]]
+;
+ %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+ %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
+ %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %1, <8 x float> %2)
+ ret <8 x float> %3
+}
+
+define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_vfnmsub256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> zeroinitializer, [[TMP9]]
+; CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A2]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP12]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[TMP1]], <8 x float> [[TMP2]])
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[_MSPROP3]], <8 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x float> [[TMP3]] to <8 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x float> [[A0]] to <8 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i32> [[TMP14]], [[TMP12]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i32> [[TMP15]], <8 x i32> [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP4]], <8 x float> [[TMP3]], <8 x float> [[A0]]
+; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[TMP11]]
+;
+ %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+ %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
+ %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %1, <8 x float> %2)
+ %4 = bitcast i8 %mask to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %a0
+ ret <8 x float> %5
+}
+
+define <4 x float> @test_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: define <4 x float> @test_vfnmsub128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT: [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A2]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT: store <4 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP3]]
+;
+ %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+ %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
+ %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %1, <4 x float> %2)
+ ret <4 x float> %3
+}
+
+define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfnmsub128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> zeroinitializer, [[TMP9]]
+; CHECK-NEXT: [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A2]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP12]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP3]], <4 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[TMP3]] to <4 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i32> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i32> [[TMP14]], [[TMP12]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP4]], <4 x i32> [[TMP15]], <4 x i32> [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP3]], <4 x float> [[A0]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP11]]
+;
+ %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+ %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
+ %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %1, <4 x float> %2)
+ %4 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %5 = select <4 x i1> %extract, <4 x float> %3, <4 x float> %a0
+ ret <4 x float> %5
+}
+
+define <4 x double> @test_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-LABEL: define <4 x double> @test_vfnmsub256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT: [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i64> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[TMP1]], <4 x double> [[TMP2]])
+; CHECK-NEXT: store <4 x i64> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[TMP3]]
+;
+ %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
+ %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
+ %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %1, <4 x double> %2)
+ ret <4 x double> %3
+}
+
+define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_vfnmsub256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT: [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i64> [[TMP12]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[TMP1]], <4 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP3]], <4 x i64> [[TMP12]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x double> [[TMP3]] to <4 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x double> [[A0]] to <4 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i64> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i64> [[TMP14]], [[TMP12]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP4]], <4 x i64> [[TMP15]], <4 x i64> [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP3]], <4 x double> [[A0]]
+; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[TMP11]]
+;
+ %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
+ %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
+ %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %1, <4 x double> %2)
+ %4 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %5 = select <4 x i1> %extract, <4 x double> %3, <4 x double> %a0
+ ret <4 x double> %5
+}
+
+define <2 x double> @test_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: define <2 x double> @test_vfnmsub128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <2 x i64> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or <2 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT: store <2 x i64> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP3]]
+;
+ %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a1
+ %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
+ %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %1, <2 x double> %2)
+ ret <2 x double> %3
+}
+
+define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_vfnmsub128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <2 x i64> [[TMP12]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or <2 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP3]], <2 x i64> [[TMP12]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x double> [[TMP3]] to <2 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x double> [[A0]] to <2 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i64> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <2 x i64> [[TMP14]], [[TMP12]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP4]], <2 x i64> [[TMP15]], <2 x i64> [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP3]], <2 x double> [[A0]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP11]]
+;
+ %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a1
+ %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
+ %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %1, <2 x double> %2)
+ %4 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <2 x i32> <i32 0, i32 1>
+ %5 = select <2 x i1> %extract, <2 x double> %3, <2 x double> %a0
+ ret <2 x double> %5
+}
+
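+; Annotation: in the mask3 variants below, %x2 is both the negated addend and
+; the passthrough of the masked select, so its shadow is also the false
+; operand of the shadow select.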
CHECK-NEXT: [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <2 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP3]], <2 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x double> [[TMP3]] to <2 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x double> [[A0]] to <2 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i64> [[TMP8]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP15:%.*]] = or <2 x i64> [[TMP14]], [[TMP12]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP4]], <2 x i64> [[TMP15]], <2 x i64> [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP3]], <2 x double> [[A0]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[TMP11]] +; + %1 = fsub <2 x double> , %a1 + %2 = fsub <2 x double> , %a2 + %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %1, <2 x double> %2) + %4 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <2 x i32> + %5 = select <2 x i1> %extract, <2 x double> %3, <2 x double> %a0 + ret <2 x double> %5 +} + +define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfnmsub_pd_128( +; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP5]] +; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X0]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP9]] +; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP12]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <2 x i64> [[_MSPROP2]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP1]], <2 x double> [[X1]], <2 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <2 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP3]], <2 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x double> [[TMP3]] to <2 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x double> [[X2]] to <2 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i64> [[TMP8]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP15:%.*]] = or <2 x i64> [[TMP14]], [[TMP9]] +; CHECK-NEXT: 
+define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfnmsub_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X0]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP1]], <2 x double> [[X1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP3]], <2 x i64> [[TMP9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x double> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[X2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i64> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP14]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP4]], <2 x i64> [[TMP15]], <2 x i64> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP3]], <2 x double> [[X2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP11]]
+;
+  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %1, <2 x double> %x1, <2 x double> %2)
+  %4 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <2 x i32> <i32 0, i32 1>
+  %5 = select <2 x i1> %extract, <2 x double> %3, <2 x double> %x2
+  ret <2 x double> %5
+}
+
+define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfnmsub_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X0]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[X1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP3]], <4 x i64> [[TMP9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double> [[TMP3]] to <4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i64> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP14]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP4]], <4 x i64> [[TMP15]], <4 x i64> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP3]], <4 x double> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP11]]
+;
+  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x0
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %1, <4 x double> %x1, <4 x double> %2)
+  %4 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = select <4 x i1> %extract, <4 x double> %3, <4 x double> %x2
+  ret <4 x double> %5
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfnmsub_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X0]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[X1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP3]], <4 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[X2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP14]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP4]], <4 x i32> [[TMP15]], <4 x i32> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP3]], <4 x float> [[X2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP11]]
+;
+  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %1, <4 x float> %x1, <4 x float> %2)
+  %4 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = select <4 x i1> %extract, <4 x float> %3, <4 x float> %x2
+  ret <4 x float> %5
+}
+
+define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfnmsub_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X0]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[X1]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[_MSPROP3]], <8 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x float> [[TMP3]] to <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[X2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <8 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP14]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i32> [[TMP15]], <8 x i32> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP4]], <8 x float> [[TMP3]], <8 x float> [[X2]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP11]]
+;
+  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %1, <8 x float> %x1, <8 x float> %2)
+  %4 = bitcast i8 %x3 to <8 x i1>
+  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %x2
+  ret <8 x float> %5
+}
+
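+; The fmaddsub tests below model vfmaddsub as two full-width FMAs -- one on %c
+; and one on the negated %c -- blended by a shufflevector that takes the even
+; lanes from the subtract result and the odd lanes from the add result. The
+; instrumentation blends the two shadow vectors with the same shuffle mask, so
+; each result lane inherits the shadow of the FMA that produced it.
+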
+define <8 x float> @test_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) #0 {
+; CHECK-LABEL: define <8 x float> @test_fmaddsub256_ps(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[C]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i32> [[_MSPROP4]], <8 x i32> [[_MSPROP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP4]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c)
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %2)
+  %4 = shufflevector <8 x float> %3, <8 x float> %1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  ret <8 x float> %4
+}
+
+define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_fmaddsub256_ps(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[C]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i32> [[_MSPROP4]], <8 x i32> [[_MSPROP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP5]], <8 x i32> [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x float> [[A]] to <8 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <8 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> [[TMP16]], <8 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP5]], <8 x float> [[TMP4]], <8 x float> [[A]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP12]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c)
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %2)
+  %4 = shufflevector <8 x float> %3, <8 x float> %1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  %5 = bitcast i8 %mask to <8 x i1>
+  %6 = select <8 x i1> %5, <8 x float> %4, <8 x float> %a
+  ret <8 x float> %6
+}
+
+define <4 x float> @test_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
+; CHECK-LABEL: define <4 x float> @test_fmaddsub128_ps(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[C]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[C]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i32> [[_MSPROP4]], <4 x i32> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %2)
+  %4 = shufflevector <4 x float> %3, <4 x float> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_fmaddsub128_ps(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[C]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[C]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i32> [[_MSPROP4]], <4 x i32> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP16]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP4]], <4 x float> [[A]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP12]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %2)
+  %4 = shufflevector <4 x float> %3, <4 x float> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x float> %4, <4 x float> %a
+  ret <4 x float> %6
+}
+
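+; Negation is expressed as `fsub -0.0, %x`, whose shadow is
+; `or zeroinitializer, shadow(%x)`: the constant operand carries a clean
+; shadow, so the other operand's shadow passes through unchanged.
+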
+define <4 x double> @test_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-LABEL: define <4 x double> @test_vfmaddsub256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[A2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i64> [[_MSPROP4]], <4 x i64> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %2)
+  %4 = shufflevector <4 x double> %3, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x double> %4
+}
+
+define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_vfmaddsub256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[A2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i64> [[_MSPROP4]], <4 x i64> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP5]], <4 x i64> [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x double> [[A0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i64> [[TMP15]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i64> [[TMP16]], <4 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP4]], <4 x double> [[A0]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP12]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %2)
+  %4 = shufflevector <4 x double> %3, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x double> %4, <4 x double> %a0
+  ret <4 x double> %6
+}
+
+define <2 x double> @test_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: define <2 x double> @test_vfmaddsub128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <2 x i64> [[_MSPROP4]], <2 x i64> [[_MSPROP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP4]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %2)
+  %4 = shufflevector <2 x double> %3, <2 x double> %1, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %4
+}
+
+define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_vfmaddsub128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <2 x i64> [[_MSPROP4]], <2 x i64> [[_MSPROP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP5]], <2 x i64> [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x double> [[A0]] to <2 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <2 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <2 x i64> [[TMP15]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP6]], <2 x i64> [[TMP16]], <2 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP4]], <2 x double> [[A0]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP12]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %2)
+  %4 = shufflevector <2 x double> %3, <2 x double> %1, <2 x i32> <i32 0, i32 3>
+  %5 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+  %6 = select <2 x i1> %extract, <2 x double> %4, <2 x double> %a0
+  ret <2 x double> %6
+}
+
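+; In the mask3 variants below the passthrough operand is %x2, so the fallback
+; shadow in the select is the shadow loaded for the third argument. In the
+; maskz variants the passthrough is zeroinitializer, whose shadow is clean,
+; which is why zeroinitializer appears in the shadow select instead.
+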
+define <2 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfmaddsub_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <2 x i64> [[_MSPROP4]], <2 x i64> [[_MSPROP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP5]], <2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x double> [[X2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <2 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <2 x i64> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP6]], <2 x i64> [[TMP16]], <2 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP4]], <2 x double> [[X2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP12]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2)
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %2)
+  %4 = shufflevector <2 x double> %3, <2 x double> %1, <2 x i32> <i32 0, i32 3>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+  %6 = select <2 x i1> %extract, <2 x double> %4, <2 x double> %x2
+  ret <2 x double> %6
+}
+
+define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_maskz_vfmaddsub_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <2 x i64> [[_MSPROP4]], <2 x i64> [[_MSPROP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP5]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <2 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP6]], <2 x i64> [[TMP10]], <2 x i64> [[TMP16]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP4]], <2 x double> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP11]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2)
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %2)
+  %4 = shufflevector <2 x double> %3, <2 x double> %1, <2 x i32> <i32 0, i32 3>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+  %6 = select <2 x i1> %extract, <2 x double> %4, <2 x double> zeroinitializer
+  ret <2 x double> %6
+}
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmaddsub_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i64> [[_MSPROP4]], <4 x i64> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP5]], <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i64> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i64> [[TMP16]], <4 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP4]], <4 x double> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP12]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2)
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %2)
+  %4 = shufflevector <4 x double> %3, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x double> %4, <4 x double> %x2
+  ret <4 x double> %6
+}
+
+define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_vfmaddsub_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i64> [[_MSPROP4]], <4 x i64> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP5]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i64> [[TMP10]], <4 x i64> [[TMP16]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP4]], <4 x double> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP11]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2)
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %2)
+  %4 = shufflevector <4 x double> %3, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x double> %4, <4 x double> zeroinitializer
+  ret <4 x double> %6
+}
+
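+; Argument shadows live at fixed offsets in @__msan_param_tls: the 128-bit
+; vector arguments here use 16-byte slots (offsets 0/16/32, mask shadow at 48)
+; and the 256-bit ones use 32-byte slots (offsets 0/32/64, mask shadow at 96),
+; matching the load offsets checked throughout these tests.
+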
+define <4 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfmaddsub_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i32> [[_MSPROP4]], <4 x i32> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[X2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP16]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP4]], <4 x float> [[X2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP12]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2)
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %2)
+  %4 = shufflevector <4 x float> %3, <4 x float> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x float> %4, <4 x float> %x2
+  ret <4 x float> %6
+}
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_maskz_vfmaddsub_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i32> [[_MSPROP4]], <4 x i32> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i32> [[TMP8]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP10]], <4 x i32> [[TMP16]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP4]], <4 x float> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP11]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2)
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %2)
+  %4 = shufflevector <4 x float> %3, <4 x float> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x float> %4, <4 x float> zeroinitializer
+  ret <4 x float> %6
+}
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfmaddsub_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i32> [[_MSPROP4]], <8 x i32> [[_MSPROP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP5]], <8 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x float> [[X2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <8 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> [[TMP16]], <8 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP5]], <8 x float> [[TMP4]], <8 x float> [[X2]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP12]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2)
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %2)
+  %4 = shufflevector <8 x float> %3, <8 x float> %1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %6 = select <8 x i1> %5, <8 x float> %4, <8 x float> %x2
+  ret <8 x float> %6
+}
+
+define <8 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_maskz_vfmaddsub_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i32> [[_MSPROP4]], <8 x i32> [[_MSPROP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP5]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <8 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <8 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP15]], <8 x i32> [[TMP10]], <8 x i32> [[TMP16]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x float> [[TMP4]], <8 x float> zeroinitializer
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP11]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2)
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %2)
+  %4 = shufflevector <8 x float> %3, <8 x float> %1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %6 = select <8 x i1> %5, <8 x float> %4, <8 x float> zeroinitializer
+  ret <8 x float> %6
+}
+
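+; The fmsubadd tests below reuse the fmaddsub decomposition but swap the
+; shufflevector operands (%1 before %3), so the add results land in the even
+; lanes and the subtract results in the odd lanes; the shadow blend swaps
+; [[_MSPROP1]] and [[_MSPROP4]] to match.
+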
+define <2 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfmsubadd_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <2 x i64> [[_MSPROP1]], <2 x i64> [[_MSPROP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP5]], <2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x double> [[X2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <2 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <2 x i64> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP6]], <2 x i64> [[TMP16]], <2 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP4]], <2 x double> [[X2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP12]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2)
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %2)
+  %4 = shufflevector <2 x double> %1, <2 x double> %3, <2 x i32> <i32 0, i32 3>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+  %6 = select <2 x i1> %extract, <2 x double> %4, <2 x double> %x2
+  ret <2 x double> %6
+}
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmsubadd_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i64> [[_MSPROP1]], <4 x i64> [[_MSPROP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP5]], <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i64> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i64> [[TMP16]], <4 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP4]], <4 x double> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP12]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2)
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %2)
+  %4 = shufflevector <4 x double> %1, <4 x double> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x double> %4, <4 x double> %x2
+  ret <4 x double> %6
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfmsubadd_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i32> [[_MSPROP1]], <4 x i32> [[_MSPROP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[X2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP16]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP4]], <4 x float> [[X2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP12]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2)
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %2)
+  %4 = shufflevector <4 x float> %1, <4 x float> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x float> %4, <4 x float> %x2
+  ret <4 x float> %6
+}
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfmsubadd_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i32> [[_MSPROP1]], <8 x i32> [[_MSPROP4]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP3]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x
i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP5]], <8 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x float> [[X2]] to <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i32> [[TMP9]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[TMP13]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> [[TMP16]], <8 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP5]], <8 x float> [[TMP4]], <8 x float> [[X2]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP12]] +; + %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2) + %2 = fsub <8 x float> , %x2 + %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %2) + %4 = shufflevector <8 x float> %1, <8 x float> %3, <8 x i32> + %5 = bitcast i8 %x3 to <8 x i1> + %6 = select <8 x i1> %5, <8 x float> %4, <8 x float> %x2 + ret <8 x float> %6 +} + +define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) #0 { +; +; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmk( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[A2:%.*]] = load <4 x float>, ptr [[PTR_A2]], align 16 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP15]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP12]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: 
+; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i32> [[TMP17]], [[TMP12]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP18]], <4 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[A0]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP9]]
+;
+ %a2 = load <4 x float>, ptr %ptr_a2
+ %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32>
+ %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %a0
+ ret <4 x float> %3
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmka(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: [[A2:%.*]] = load <4 x float>, ptr [[PTR_A2]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP15]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i32> [[TMP17]], [[TMP12]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP18]], <4 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[A0]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP9]]
+;
+ %a2 = load <4 x float>, ptr %ptr_a2, align 8
+ %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32>
+ %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %a0
+ ret <4 x float> %3
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmkz(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB5]]:
+; CHECK-NEXT: [[A2:%.*]] = load <4 x float>, ptr [[PTR_A2]], align 16
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP1]]
+;
+ %a2 = load <4 x float>, ptr %ptr_a2
+ %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %1
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmkza(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB5]]:
+; CHECK-NEXT: [[A2:%.*]] = load <4 x float>, ptr [[PTR_A2]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP1]]
+;
+ %a2 = load <4 x float>, ptr %ptr_a2, align 4
+ %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %1
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmb(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_A2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP15]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[Q]], i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
+; CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[Q]], i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
+; CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <4 x float> [[VECINIT4_I]], float [[Q]], i32 3
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[_MSPROP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[VECINIT6_I]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP5]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i32> [[TMP17]], [[TMP12]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP18]], <4 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[A0]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP9]]
+;
+ %q = load float, ptr %ptr_a2
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+ %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+ %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+ %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32>
+ %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %a0
+ ret <4 x float> %3
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmba(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_A2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP15]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[Q]], i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
+; CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[Q]], i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
+; CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <4 x float> [[VECINIT4_I]], float [[Q]], i32 3
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[_MSPROP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[VECINIT6_I]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP5]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i32> [[TMP17]], [[TMP12]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP18]], <4 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[A0]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP9]]
+;
+ %q = load float, ptr %ptr_a2, align 4
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+ %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+ %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+ %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32>
+ %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %a0
+ ret <4 x float> %3
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmbz(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB5]]:
+; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_A2]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[Q]], i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
+; CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[Q]], i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
+; CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <4 x float> [[VECINIT4_I]], float [[Q]], i32 3
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[_MSPROP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[VECINIT6_I]])
+; CHECK-NEXT: store <4 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP1]]
+;
+ %q = load float, ptr %ptr_a2
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+ %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+ %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+ %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i)
+ ret <4 x float> %1
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmbza(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB5]]:
+; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_A2]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[Q]], i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
+; CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[Q]], i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
+; CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <4 x float> [[VECINIT4_I]], float [[Q]], i32 3
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[_MSPROP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[VECINIT6_I]])
+; CHECK-NEXT: store <4 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP1]]
+;
+ %q = load float, ptr %ptr_a2, align 4
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+ %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+ %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+ %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i)
+ ret <4 x float> %1
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_vfmadd128_pd_rmk(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: [[A2:%.*]] = load <2 x double>, ptr [[PTR_A2]], align 16
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP15]], align 16
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP12]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <2 x i32>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP1]], <2 x i64> [[TMP12]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x double> [[A0]] to <2 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <2 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <2 x i64> [[TMP17]], [[TMP12]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP2]], <2 x i64> [[TMP18]], <2 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> [[A0]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP9]]
+;
+ %a2 = load <2 x double>, ptr %ptr_a2
+ %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32>
+ %3 = select <2 x i1> %extract, <2 x double> %1, <2 x double> %a0
+ ret <2 x double> %3
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_vfmadd128_pd_rmkz(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB5]]:
+; CHECK-NEXT: [[A2:%.*]] = load <2 x double>, ptr [[PTR_A2]], align 16
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP8]], align 16
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]])
+; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP1]]
+;
+ %a2 = load <2 x double>, ptr %ptr_a2
+ %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %1
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_vfmadd256_pd_rmk(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB6]]:
+; CHECK-NEXT: [[A2:%.*]] = load <4 x double>, ptr [[PTR_A2]], align 32
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i64>, ptr [[TMP15]], align 32
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP12]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[A2]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP1]], <4 x i64> [[TMP12]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x double> [[A0]] to <4 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i64> [[TMP17]], [[TMP12]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i64> [[TMP18]], <4 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[A0]]
+; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[TMP9]]
+;
+ %a2 = load <4 x double>, ptr %ptr_a2
+ %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32>
+ %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %a0
+ ret <4 x double> %3
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_vfmadd256_pd_rmkz(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB5]]:
+; CHECK-NEXT: [[A2:%.*]] = load <4 x double>, ptr [[PTR_A2]], align 32
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 32
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[A2]])
+; CHECK-NEXT: store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[TMP1]]
+;
+ %a2 = load <4 x double>, ptr %ptr_a2
+ %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ ret <4 x double> %1
+}
+
+define <8 x i32> @combine_vpermi2d_vpermps(<16 x i32> noundef %a) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @combine_vpermi2d_vpermps(
+; CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> splat (i32 -1), <8 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> splat (i32 -1), <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP1]], <8 x i32> , <8 x i32> [[TMP2]])
+; CHECK-NEXT: store <8 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[TMP4]]
+;
+ %1 = shufflevector <16 x i32> %a, <16 x i32> poison, <8 x i32>
+ %2 = shufflevector <16 x i32> %a, <16 x i32> poison, <8 x i32>
+ %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %1, <8 x i32> , <8 x i32> %2)
+ ret <8 x i32> %3
+}
+
+declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
+declare <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double>, <2 x double>, <2 x i1>)
+declare <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float>, <4 x float>, <4 x i1>)
+declare <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64>, <2 x i64>, <2 x i1>)
+declare <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32>, <4 x i32>, <4 x i1>)
+declare <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double>, <2 x double>, <2 x i1>)
+declare <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float>, <4 x float>, <4 x i1>)
+declare <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64>, <2 x i64>, <2 x i1>)
+declare <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32>, <4 x i32>, <4 x i1>)
+declare <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double>, <4 x double>, <4 x i1>)
+declare <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float>, <8 x float>, <8 x i1>)
+declare <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64>, <4 x i64>, <4 x i1>)
+declare <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32>, <8 x i32>, <8 x i1>)
+declare <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double>, <4 x double>, <4 x i1>)
+declare <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float>, <8 x float>, <8 x i1>)
+declare <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64>, <4 x i64>, <4 x i1>)
+declare <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32>, <8 x i32>, <8 x i1>)
+
+attributes #0 = { sanitize_memory }
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll new file mode 100644 index 0000000000000..2350d75b29b44 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll @@ -0,0 +1,722 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mattr=+avx512f -passes=msan -S | FileCheck %s + +; Forked from llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; +; vXi64 +; + +define <2 x i64> @shuffle_vpermv3_v2i64(<2 x i64> %x0, <2 x i64> %x1) #0 { +; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP2]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> , <2 x i64> [[X1]]) +; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[R]] +; + %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> , <2 x i64> %x1) + ret <2 x i64> %r +} + +define <2 x i64> @shuffle_vpermv3_v2i64_unary(<2 x i64> %x0) #0 { +; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_unary( +; CHECK-SAME: <2 x i64> [[X0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> , <2 x i64> [[X0]]) +; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[R]] +; + %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> , <2 x i64> %x0) + ret <2 x i64> %r +} + +define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %m) #0 { +; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[M:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i64> [[M]], splat (i64 -1) +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP4]], +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[T:%.*]] = or <2 x i64> [[M]], +; 
CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]]) +; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[R]] +; + %t = or <2 x i64> %m, + %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %t, <2 x i64> %x1) + ret <2 x i64> %r +} + +define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits_negative(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %m) #0 { +; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits_negative( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[M:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i64> [[M]], splat (i64 -1) +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP4]], +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[T:%.*]] = or <2 x i64> [[M]], +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]]) +; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[R]] +; + %t = or <2 x i64> %m, + %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %t, <2 x i64> %x1) + ret <2 x i64> %r +} + +define <4 x i64> @shuffle_vpermv3_v4i64(<4 x i64> %x0, <4 x i64> %x1) #0 { +; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64( +; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP2]] +; CHECK-NEXT: [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> , <4 x i64> [[X1]]) +; CHECK-NEXT: store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[R]] +; + %r = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> , <4 x i64> %x1) + ret <4 x i64> %r +} + +define <4 x i64> @shuffle_vpermv3_v4i64_unary(<4 x i64> %x0) #0 { +; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64_unary( +; CHECK-SAME: <4 x i64> [[X0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> 
[[_MSPROP]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> , <4 x i64> [[X0]]) +; CHECK-NEXT: store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[R]] +; + %r = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> , <4 x i64> %x0) + ret <4 x i64> %r +} + +define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %m) #0 { +; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits( +; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[M:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i64> [[M]], splat (i64 -1) +; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i64> [[TMP4]], +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i64> [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[T:%.*]] = or <4 x i64> [[M]], +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[T]], <4 x i64> [[X1]]) +; CHECK-NEXT: store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[R]] +; + %t = or <4 x i64> %m, + %r = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %t, <4 x i64> %x1) + ret <4 x i64> %r +} + +define <8 x i64> @shuffle_vpermv3_v8i64(<8 x i64> %x0, <8 x i64> %x1) #0 { +; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64( +; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP2]] +; CHECK-NEXT: [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> , <8 x i64> [[X1]]) +; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[R]] +; + %r = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1) + ret <8 x i64> %r +} + +define <8 x i64> @shuffle_vpermv3_v8i64_unary(<8 x i64> %x0) #0 { +; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64_unary( +; CHECK-SAME: <8 x i64> [[X0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> , <8 x i64> [[X0]]) +; CHECK-NEXT: 
store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[R]] +; + %r = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x0) + ret <8 x i64> %r +} + +define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %m) #0 { +; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits( +; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[M:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = xor <8 x i64> [[M]], splat (i64 -1) +; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = and <8 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP4]], +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[T:%.*]] = or <8 x i64> [[M]], +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> [[T]], <8 x i64> [[X1]]) +; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[R]] +; + %t = or <8 x i64> %m, + %r = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %t, <8 x i64> %x1) + ret <8 x i64> %r +} + +; +; vXi32 +; + +define <4 x i32> @shuffle_vpermv3_v4i32(<4 x i32> %x0, <4 x i32> %x1) #0 { +; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP2]] +; CHECK-NEXT: [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> , <4 x i32> [[X1]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[R]] +; + %r = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> , <4 x i32> %x1) + ret <4 x i32> %r +} + +define <4 x i32> @shuffle_vpermv3_v4i32_unary(<4 x i32> %x0) #0 { +; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32_unary( +; CHECK-SAME: <4 x i32> [[X0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> , <4 x i32> [[X0]]) +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[R]] +; + %r = call <4 x i32> 
+;
+; vXi32
+;
+
+define <4 x i32> @shuffle_vpermv3_v4i32(<4 x i32> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> , <4 x i32> [[X1]])
+; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+ %r = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> , <4 x i32> %x1)
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @shuffle_vpermv3_v4i32_unary(<4 x i32> %x0) #0 {
+; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32_unary(
+; CHECK-SAME: <4 x i32> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> , <4 x i32> [[X0]])
+; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+ %r = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> , <4 x i32> %x0)
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %m) #0 {
+; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i32> [[M]], splat (i32 -1)
+; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP4]],
+; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[T:%.*]] = or <4 x i32> [[M]],
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[T]], <4 x i32> [[X1]])
+; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+ %t = or <4 x i32> %m,
+ %r = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %t, <4 x i32> %x1)
+ ret <4 x i32> %r
+}
+
+define <8 x i32> @shuffle_vpermv3_v8i32(<8 x i32> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> , <8 x i32> [[X1]])
+; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[R]]
+;
+ %r = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> , <8 x i32> %x1)
+ ret <8 x i32> %r
+}
+
+define <8 x i32> @shuffle_vpermv3_v8i32_unary(<8 x i32> %x0) #0 {
+; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32_unary(
+; CHECK-SAME: <8 x i32> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> , <8 x i32> [[X0]])
+; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[R]]
+;
+ %r = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> , <8 x i32> %x0)
+ ret <8 x i32> %r
+}
+
+define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %m) #0 {
+; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = xor <8 x i32> [[M]], splat (i32 -1)
+; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = and <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i32> [[TMP4]],
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[T:%.*]] = or <8 x i32> [[M]],
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[T]], <8 x i32> [[X1]])
+; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[R]]
+;
+ %t = or <8 x i32> %m,
+ %r = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %t, <8 x i32> %x1)
+ ret <8 x i32> %r
+}
+
+define <16 x i32> @shuffle_vpermv3_v16i32(<16 x i32> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32(
+; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> , <16 x i32> [[X1]])
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[R]]
+;
+ %r = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1)
+ ret <16 x i32> %r
+}
+
+define <16 x i32> @shuffle_vpermv3_v16i32_unary(<16 x i32> %x0) #0 {
+; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32_unary(
+; CHECK-SAME: <16 x i32> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> , <16 x i32> [[X0]])
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[R]]
+;
+ %r = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x0)
+ ret <16 x i32> %r
+}
+
+define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %m) #0 {
+; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(
+; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = xor <16 x i32> [[M]], splat (i32 -1)
+; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = and <16 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP4]],
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[T:%.*]] = or <16 x i32> [[M]],
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> [[T]], <16 x i32> [[X1]])
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[R]]
+;
+ %t = or <16 x i32> %m,
+ %r = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %t, <16 x i32> %x1)
+ ret <16 x i32> %r
+}
+
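+; In the demandedbits tests the shadow of "%t = or %m, <constant>" is built
+; with MSan's rule for bitwise OR, S = (S1 & S2) | (~V1 & S2) | (S1 & ~V2);
+; with a constant RHS (S2 = 0) the first two terms fold to zeroinitializer
+; and only the "and" of the index shadow with a constant mask (the TMP7
+; value above) survives before being ORed into the result shadow.
+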
+;
+; vXi16
+;
+
+define <8 x i16> @shuffle_vpermv3_v8i16(<8 x i16> %x0, <8 x i16> %x1) #0 {
+; CHECK-LABEL: define <8 x i16> @shuffle_vpermv3_v8i16(
+; CHECK-SAME: <8 x i16> [[X0:%.*]], <8 x i16> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> , <8 x i16> [[X1]])
+; CHECK-NEXT: store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[R]]
+;
+ %r = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> , <8 x i16> %x1)
+ ret <8 x i16> %r
+}
+
+define <8 x i16> @shuffle_vpermv3_v8i16_unary(<8 x i16> %x0) #0 {
+; CHECK-LABEL: define <8 x i16> @shuffle_vpermv3_v8i16_unary(
+; CHECK-SAME: <8 x i16> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> , <8 x i16> [[X0]])
+; CHECK-NEXT: store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[R]]
+;
+ %r = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> , <8 x i16> %x0)
+ ret <8 x i16> %r
+}
+
+define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %m) #0 {
+; CHECK-LABEL: define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(
+; CHECK-SAME: <8 x i16> [[X0:%.*]], <8 x i16> [[X1:%.*]], <8 x i16> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = xor <8 x i16> [[M]], splat (i16 -1)
+; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = and <8 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i16> [[TMP4]],
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i16> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i16> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[T:%.*]] = or <8 x i16> [[M]],
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> [[T]], <8 x i16> [[X1]])
+; CHECK-NEXT: store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[R]]
+;
+ %t = or <8 x i16> %m,
+ %r = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %t, <8 x i16> %x1)
+ ret <8 x i16> %r
+}
+
+define <16 x i16> @shuffle_vpermv3_v16i16(<16 x i16> %x0, <16 x i16> %x1) #0 {
+; CHECK-LABEL: define <16 x i16> @shuffle_vpermv3_v16i16(
+; CHECK-SAME: <16 x i16> [[X0:%.*]], <16 x i16> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> , <16 x i16> [[X1]])
+; CHECK-NEXT: store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i16> [[R]]
+;
+ %r = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> , <16 x i16> %x1)
+ ret <16 x i16> %r
+}
+
+define <16 x i16> @shuffle_vpermv3_v16i16_unary(<16 x i16> %x0) #0 {
+; CHECK-LABEL: define <16 x i16> @shuffle_vpermv3_v16i16_unary(
+; CHECK-SAME: <16 x i16> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> , <16 x i16> [[X0]])
+; CHECK-NEXT: store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i16> [[R]]
+;
+ %r = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> , <16 x i16> %x0)
+ ret <16 x i16> %r
+}
+
+define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %m) #0 {
+; CHECK-LABEL: define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(
+; CHECK-SAME: <16 x i16> [[X0:%.*]], <16 x i16> [[X1:%.*]], <16 x i16> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = xor <16 x i16> [[M]], splat (i16 -1)
+; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = and <16 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i16> [[TMP4]],
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i16> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i16> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[T:%.*]] = or <16 x i16> [[M]],
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> [[T]], <16 x i16> [[X1]])
+; CHECK-NEXT: store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i16> [[R]]
+;
+ %t = or <16 x i16> %m,
+ %r = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %t, <16 x i16> %x1)
+ ret <16 x i16> %r
+}
+
+define <32 x i16> @shuffle_vpermv3_v32i16(<32 x i16> %x0, <32 x i16> %x1) #0 {
+; CHECK-LABEL: define <32 x i16> @shuffle_vpermv3_v32i16(
+; CHECK-SAME: <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> , <32 x i16> [[X1]])
+; CHECK-NEXT: store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <32 x i16> [[R]]
+;
+ %r = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> , <32 x i16> %x1)
+ ret <32 x i16> %r
+}
+
+define <32 x i16> @shuffle_vpermv3_v32i16_unary(<32 x i16> %x0) #0 {
+; CHECK-LABEL: define <32 x i16> @shuffle_vpermv3_v32i16_unary(
+; CHECK-SAME: <32 x i16> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> , <32 x i16> [[X0]])
+; CHECK-NEXT: store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <32 x i16> [[R]]
+;
+ %r = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> , <32 x i16> %x0)
+ ret <32 x i16> %r
+}
+
+define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %m) #0 {
+; CHECK-LABEL: define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(
+; CHECK-SAME: <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]], <32 x i16> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = xor <32 x i16> [[M]], splat (i16 -1)
+; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = and <32 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = and <32 x i16> [[TMP4]],
+; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i16> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <32 x i16> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[T:%.*]] = or <32 x i16> [[M]],
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> [[T]], <32 x i16> [[X1]])
+; CHECK-NEXT: store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <32 x i16> [[R]]
+;
+ %t = or <32 x i16> %m,
+ %r = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %t, <32 x i16> %x1)
+ ret <32 x i16> %r
+}
+
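+; Parameter shadows are loaded from __msan_param_tls at each argument's byte
+; offset; for three 512-bit arguments that is offsets 0, 64, and 128, as in
+; the loads above. The vXi8 tests below follow the same layout.
+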
+;
+; vXi8
+;
+
+define <16 x i8> @shuffle_vpermv3_v16i8(<16 x i8> %x0, <16 x i8> %x1) #0 {
+; CHECK-LABEL: define <16 x i8> @shuffle_vpermv3_v16i8(
+; CHECK-SAME: <16 x i8> [[X0:%.*]], <16 x i8> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> , <16 x i8> [[X1]])
+; CHECK-NEXT: store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[R]]
+;
+ %r = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> , <16 x i8> %x1)
+ ret <16 x i8> %r
+}
+
+define <16 x i8> @shuffle_vpermv3_v16i8_unary(<16 x i8> %x0) #0 {
+; CHECK-LABEL: define <16 x i8> @shuffle_vpermv3_v16i8_unary(
+; CHECK-SAME: <16 x i8> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> , <16 x i8> [[X0]])
+; CHECK-NEXT: store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[R]]
+;
+ %r = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> , <16 x i8> %x0)
+ ret <16 x i8> %r
+}
+
+define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %m) #0 {
+; CHECK-LABEL: define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(
+; CHECK-SAME: <16 x i8> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = xor <16 x i8> [[M]], splat (i8 -1)
+; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = and <16 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i8> [[TMP4]],
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i8> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i8> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[T:%.*]] = or <16 x i8> [[M]],
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> [[T]], <16 x i8> [[X1]])
+; CHECK-NEXT: store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[R]]
+;
+ %t = or <16 x i8> %m,
+ %r = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %t, <16 x i8> %x1)
+ ret <16 x i8> %r
+}
+
+define <32 x i8> @shuffle_vpermv3_v32i8(<32 x i8> %x0, <32 x i8> %x1) #0 {
+; CHECK-LABEL: define <32 x i8> @shuffle_vpermv3_v32i8(
+; CHECK-SAME: <32 x i8> [[X0:%.*]], <32 x i8> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> , <32 x i8> [[X1]])
+; CHECK-NEXT: store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <32 x i8> [[R]]
+;
+ %r = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> , <32 x i8> %x1)
+ ret <32 x i8> %r
+}
+
+define <32 x i8> @shuffle_vpermv3_v32i8_unary(<32 x i8> %x0) #0 {
+; CHECK-LABEL: define <32 x i8> @shuffle_vpermv3_v32i8_unary(
+; CHECK-SAME: <32 x i8> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> , <32 x i8> [[X0]])
+; CHECK-NEXT: store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <32 x i8> [[R]]
+;
+ %r = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> , <32 x i8> %x0)
+ ret <32 x i8> %r
+}
+
+define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %m) #0 {
+; CHECK-LABEL: define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(
+; CHECK-SAME: <32 x i8> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = xor <32 x i8> [[M]], splat (i8 -1)
+; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = and <32 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = and <32 x i8> [[TMP4]],
+; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i8> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <32 x i8> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[T:%.*]] = or <32 x i8> [[M]],
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> [[T]], <32 x i8> [[X1]])
+; CHECK-NEXT: store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <32 x i8> [[R]]
+;
+ %t = or <32 x i8> %m,
+ %r = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %t, <32 x i8> %x1)
+ ret <32 x i8> %r
+}
+
+define <64 x i8> @shuffle_vpermv3_v64i8(<64 x i8> %x0, <64 x i8> %x1) #0 {
+; CHECK-LABEL: define <64 x i8> @shuffle_vpermv3_v64i8(
+; CHECK-SAME: <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> , <64 x i8> [[X1]])
+; CHECK-NEXT: store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <64 x i8> [[R]]
+;
+ %r = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> , <64 x i8> %x1)
+ ret <64 x i8> %r
+}
+
+define <64 x i8> @shuffle_vpermv3_v64i8_unary(<64 x i8> %x0) #0 {
+; CHECK-LABEL: define <64 x i8> @shuffle_vpermv3_v64i8_unary(
+; CHECK-SAME: <64 x i8> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> , <64 x i8> [[X0]])
+; CHECK-NEXT: store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <64 x i8> [[R]]
+;
+ %r = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> , <64 x i8> %x0)
+ ret <64 x i8> %r
+}
+
+define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %m) #0 {
+; CHECK-LABEL: define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(
+; CHECK-SAME: <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i8> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = xor <64 x i8> [[M]], splat (i8 -1)
+; CHECK-NEXT: [[TMP5:%.*]] = and <64 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = and <64 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = and <64 x i8> [[TMP4]],
+; CHECK-NEXT: [[TMP8:%.*]] = or <64 x i8> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <64 x i8> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[T:%.*]] = or <64 x i8> [[M]],
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> [[T]], <64 x i8> [[X1]])
+; CHECK-NEXT: store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <64 x i8> [[R]]
+;
+ %t = or <64 x i8> %m,
+ %r = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %t, <64 x i8> %x1)
+ ret <64 x i8> %r
+}
+
+attributes #0 = { sanitize_memory }
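+
+; The sanitize_memory attribute above is what opts every function in this
+; file into MemorySanitizer instrumentation under -passes=msan.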