diff --git a/clang/test/CodeGen/X86/avx-cast-builtins.c b/clang/test/CodeGen/X86/avx-cast-builtins.c
index 8b941c4287b9a..4dcd371471f4a 100644
--- a/clang/test/CodeGen/X86/avx-cast-builtins.c
+++ b/clang/test/CodeGen/X86/avx-cast-builtins.c
@@ -1,101 +1,230 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // REQUIRES: x86-registered-target
-// RUN: %clang_cc1 -O3 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +avx -target-feature +avx512f -target-feature +avx512fp16 -S -o - | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +avx -target-feature +avx512f -target-feature +avx512fp16 -emit-llvm -o - | FileCheck %s
 
 #include <immintrin.h>
 
+// CHECK-LABEL: define dso_local <4 x double> @test_mm256_castpd128_pd256(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = freeze <2 x double> poison
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x double> [[SHUFFLE_I]]
+//
 __m256d test_mm256_castpd128_pd256(__m128d A) {
-  // CHECK-LABEL: test_mm256_castpd128_pd256
-  // CHECK: # %bb.0:
-  // CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-  // CHECK-NEXT: ret{{[l|q]}}
   return _mm256_castpd128_pd256(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x float> @test_mm256_castps128_ps256(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x float> poison
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x float> [[SHUFFLE_I]]
+//
 __m256 test_mm256_castps128_ps256(__m128 A) {
-  // CHECK-LABEL: test_mm256_castps128_ps256
-  // CHECK: # %bb.0:
-  // CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-  // CHECK-NEXT: ret{{[l|q]}}
   return _mm256_castps128_ps256(A);
 }
 
+// CHECK-LABEL: define dso_local <4 x i64> @test_mm256_castsi128_si256(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = freeze <2 x i64> poison
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i64> [[SHUFFLE_I]]
+//
 __m256i test_mm256_castsi128_si256(__m128i A) {
-  // CHECK-LABEL: test_mm256_castsi128_si256
-  // CHECK: # %bb.0:
-  // CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-  // CHECK-NEXT: ret{{[l|q]}}
   return _mm256_castsi128_si256(A);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm256_castph128_ph256(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = freeze <8 x half> poison
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x half> [[SHUFFLE_I]]
+//
 __m256h test_mm256_castph128_ph256(__m128h A) {
-  // CHECK-LABEL: test_mm256_castph128_ph256
-  // CHECK: # %bb.0:
-  // CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-  // CHECK-NEXT: ret{{[l|q]}}
   return _mm256_castph128_ph256(A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_castph128_ph512(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT: [[__B_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = freeze <16 x half> poison
+// CHECK-NEXT: store <16 x half> [[TMP1]], ptr [[__B_I]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = freeze <8 x half> poison
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP4:%.*]] = load <16 x half>, ptr [[__B_I]], align 32
+// CHECK-NEXT: [[SHUFFLE1_I:%.*]] = shufflevector <16 x half> [[SHUFFLE_I]], <16 x half> [[TMP4]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+// CHECK-NEXT: ret <32 x half> [[SHUFFLE1_I]]
+//
 __m512h test_mm512_castph128_ph512(__m128h A) {
-  // CHECK-LABEL: test_mm512_castph128_ph512
-  // CHECK: # %bb.0:
-  // CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-  // CHECK-NEXT: ret{{[l|q]}}
   return _mm512_castph128_ph512(A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_castph256_ph512(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT: store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT: store <16 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = freeze <16 x half> poison
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x half> [[TMP1]], <16 x half> [[TMP2]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+// CHECK-NEXT: ret <32 x half> [[SHUFFLE_I]]
+//
 __m512h test_mm512_castph256_ph512(__m256h A) {
-  // CHECK-LABEL: test_mm512_castph256_ph512
-  // CHECK: # %bb.0:
-  // CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-  // CHECK-NEXT: ret{{[l|q]}}
   return _mm512_castph256_ph512(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x double> @test_mm512_castpd256_pd512(
+// CHECK-SAME: <4 x double> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT: store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x double> poison
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x double> [[SHUFFLE_I]]
+//
 __m512d test_mm512_castpd256_pd512(__m256d A){
-  // CHECK-LABEL: test_mm512_castpd256_pd512
-  // CHECK: # %bb.0:
-  // CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-  // CHECK-NEXT: ret{{[l|q]}}
   return _mm512_castpd256_pd512(A);
 }
 
+// CHECK-LABEL: define dso_local <16 x float> @test_mm512_castps256_ps512(
+// CHECK-SAME: <8 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT: store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = freeze <8 x float> poison
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x float> [[SHUFFLE_I]]
+//
 __m512 test_mm512_castps256_ps512(__m256 A){
-  // CHECK-LABEL: test_mm512_castps256_ps512
-  // CHECK: # %bb.0:
-  // CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-  // CHECK-NEXT: ret{{[l|q]}}
   return _mm512_castps256_ps512(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x double> @test_mm512_castpd128_pd512(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__B_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = freeze <4 x double> poison
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[__B_I]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = freeze <2 x double> poison
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x double>, ptr [[__B_I]], align 32
+// CHECK-NEXT: [[SHUFFLE1_I:%.*]] = shufflevector <4 x double> [[SHUFFLE_I]], <4 x double> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x double> [[SHUFFLE1_I]]
+//
 __m512d test_mm512_castpd128_pd512(__m128d A){
-  // CHECK-LABEL: test_mm512_castpd128_pd512
-  // CHECK: # %bb.0:
-  // CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-  // CHECK-NEXT: ret{{[l|q]}}
   return _mm512_castpd128_pd512(A);
 }
 
+// CHECK-LABEL: define dso_local <16 x float> @test_mm512_castps128_ps512(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = freeze <8 x float> poison
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[__B_I]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x float> poison
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x float>, ptr [[__B_I]], align 32
+// CHECK-NEXT: [[SHUFFLE1_I:%.*]] = shufflevector <8 x float> [[SHUFFLE_I]], <8 x float> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x float> [[SHUFFLE1_I]]
+//
 __m512 test_mm512_castps128_ps512(__m128 A){
-  // CHECK-LABEL: test_mm512_castps128_ps512
-  // CHECK: # %bb.0:
-  // CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-  // CHECK-NEXT: ret{{[l|q]}}
   return _mm512_castps128_ps512(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_castsi128_si512(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__B_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = freeze <4 x i64> poison
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[__B_I]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = freeze <2 x i64> poison
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr [[__B_I]], align 32
+// CHECK-NEXT: [[SHUFFLE1_I:%.*]] = shufflevector <4 x i64> [[SHUFFLE_I]], <4 x i64> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i64> [[SHUFFLE1_I]]
+//
 __m512i test_mm512_castsi128_si512(__m128i A){
-  // CHECK-LABEL: test_mm512_castsi128_si512
-  // CHECK: # %bb.0:
-  // CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-  // CHECK-NEXT: ret{{[l|q]}}
   return _mm512_castsi128_si512(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_castsi256_si512(
+// CHECK-SAME: <4 x i64> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x i64> poison
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i64> [[SHUFFLE_I]]
+//
 __m512i test_mm512_castsi256_si512(__m256i A){
-  // CHECK-LABEL: test_mm512_castsi256_si512
-  // CHECK: # %bb.0:
-  // CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-  // CHECK-NEXT: ret{{[l|q]}}
   return _mm512_castsi256_si512(A);
 }
 
diff --git a/clang/test/CodeGen/X86/avx-cmp-builtins.c b/clang/test/CodeGen/X86/avx-cmp-builtins.c
index c4e3c7ccd5498..f98cf041c05ea 100644
--- a/clang/test/CodeGen/X86/avx-cmp-builtins.c
+++ b/clang/test/CodeGen/X86/avx-cmp-builtins.c
@@ -1,5 +1,6 @@
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -O3 -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -O3 -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-- -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-X64
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386-- -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-X32
 
 // FIXME: The shufflevector instructions in test_cmpgt_sd are relying on O3 here.
 
@@ -9,62 +10,694 @@
 // Test LLVM IR codegen of cmpXY instructions
 //
+// CHECK-LABEL: define dso_local <2 x double> @test_cmp_sd(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP0]], <2 x double> [[TMP1]], i8 13)
+// CHECK-NEXT: ret <2 x double> [[TMP2]]
+// CHECK-X32-LABEL: define dso_local <2 x double> @test_cmp_sd(
+// CHECK-X32-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-X32-NEXT: [[ENTRY:.*:]]
+// CHECK-X32-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP0]], <2 x double> [[TMP1]], i8 13)
+// CHECK-X32-NEXT: ret <2 x double> [[TMP2]]
+//
+// CHECK-X64-LABEL: define dso_local <2 x double> @test_cmp_sd(
+// CHECK-X64-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-X64-NEXT: [[ENTRY:.*:]]
+// CHECK-X64-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP0]], <2 x double> [[TMP1]], i8 13)
+// CHECK-X64-NEXT: ret <2 x double> [[TMP2]]
+//
 __m128d test_cmp_sd(__m128d a, __m128d b) {
   // Expects that the third argument in LLVM IR is immediate expression
-  // CHECK: @llvm.x86.sse2.cmp.sd({{.*}}, i8 13)
   return _mm_cmp_sd(a, b, _CMP_GE_OS);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_cmp_ss(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP0]], <4 x float> [[TMP1]], i8 13)
+// CHECK-NEXT: ret <4 x float> [[TMP2]]
+// CHECK-X32-LABEL: define dso_local <4 x float> @test_cmp_ss(
+// CHECK-X32-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X32-NEXT: [[ENTRY:.*:]]
+// CHECK-X32-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP0]], <4 x float> [[TMP1]], i8 13)
+// CHECK-X32-NEXT: ret <4 x float> [[TMP2]]
+//
+// CHECK-X64-LABEL: define dso_local <4 x float> @test_cmp_ss(
+// CHECK-X64-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X64-NEXT: [[ENTRY:.*:]]
+// CHECK-X64-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP0]], <4 x float> [[TMP1]], i8 13)
+// CHECK-X64-NEXT: ret <4 x float> [[TMP2]]
+//
 __m128 test_cmp_ss(__m128 a, __m128 b) {
   // Expects that the third argument in LLVM IR is immediate expression
-  // CHECK: @llvm.x86.sse.cmp.ss({{.*}}, i8 13)
   return _mm_cmp_ss(a, b, _CMP_GE_OS);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_cmpgt_ss(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP3]], <4 x float> [[TMP4]], i8 1)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-X32-LABEL: define dso_local <4 x float> @test_cmpgt_ss(
+// CHECK-X32-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X32-NEXT: [[ENTRY:.*:]]
+// CHECK-X32-NEXT: [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP3]], <4 x float> [[TMP4]], i8 1)
+// CHECK-X32-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+// CHECK-X32-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
+// CHECK-X64-LABEL: define dso_local <4 x float> @test_cmpgt_ss(
+// CHECK-X64-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X64-NEXT: [[ENTRY:.*:]]
+// CHECK-X64-NEXT: [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP3]], <4 x float> [[TMP4]], i8 1)
+// CHECK-X64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+// CHECK-X64-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
 __m128 test_cmpgt_ss(__m128 a, __m128 b) {
-  // CHECK: @llvm.x86.sse.cmp.ss({{.*}}, i8 1)
-  // CHECK: shufflevector <{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   return _mm_cmpgt_ss(a, b);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_cmpge_ss(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP3]], <4 x float> [[TMP4]], i8 2)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-X32-LABEL: define dso_local <4 x float> @test_cmpge_ss(
+// CHECK-X32-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X32-NEXT: [[ENTRY:.*:]]
+// CHECK-X32-NEXT: [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP3]], <4 x float> [[TMP4]], i8 2)
+// CHECK-X32-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+// CHECK-X32-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
+// CHECK-X64-LABEL: define dso_local <4 x float> @test_cmpge_ss(
+// CHECK-X64-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X64-NEXT: [[ENTRY:.*:]]
+// CHECK-X64-NEXT: [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP3]], <4 x float> [[TMP4]], i8 2)
+// CHECK-X64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+// CHECK-X64-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
 __m128 test_cmpge_ss(__m128 a, __m128 b) {
-  // CHECK: @llvm.x86.sse.cmp.ss({{.*}}, i8 2)
-  // CHECK: shufflevector <{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   return _mm_cmpge_ss(a, b);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_cmpngt_ss(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP3]], <4 x float> [[TMP4]], i8 5)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-X32-LABEL: define dso_local <4 x float> @test_cmpngt_ss(
+// CHECK-X32-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X32-NEXT: [[ENTRY:.*:]]
+// CHECK-X32-NEXT: [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP3]], <4 x float> [[TMP4]], i8 5)
+// CHECK-X32-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+// CHECK-X32-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
+// CHECK-X64-LABEL: define dso_local <4 x float> @test_cmpngt_ss(
+// CHECK-X64-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X64-NEXT: [[ENTRY:.*:]]
+// CHECK-X64-NEXT: [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP3]], <4 x float> [[TMP4]], i8 5)
+// CHECK-X64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+// CHECK-X64-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
 __m128 test_cmpngt_ss(__m128 a, __m128 b) {
-  // CHECK: @llvm.x86.sse.cmp.ss({{.*}}, i8 5)
-  // CHECK: shufflevector <{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   return _mm_cmpngt_ss(a, b);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_cmpnge_ss(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP3]], <4 x float> [[TMP4]], i8 6)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-X32-LABEL: define dso_local <4 x float> @test_cmpnge_ss(
+// CHECK-X32-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X32-NEXT: [[ENTRY:.*:]]
+// CHECK-X32-NEXT: [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X32-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP3]], <4 x float> [[TMP4]], i8 6)
+// CHECK-X32-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+// CHECK-X32-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
+// CHECK-X64-LABEL: define dso_local <4 x float> @test_cmpnge_ss(
+// CHECK-X64-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X64-NEXT: [[ENTRY:.*:]]
+// CHECK-X64-NEXT: [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-X64-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP3]], <4 x float> [[TMP4]], i8 6)
+// CHECK-X64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+// CHECK-X64-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
 __m128 test_cmpnge_ss(__m128 a, __m128 b) {
-  // CHECK: @llvm.x86.sse.cmp.ss({{.*}}, i8 6)
-  // CHECK: shufflevector <{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   return _mm_cmpnge_ss(a, b);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_cmpgt_sd(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__C_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 1)
+// CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[__C_I]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[__C_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// CHECK-NEXT: store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT: ret <2 x double> [[TMP7]]
+// CHECK-X32-LABEL: define dso_local <2 x double> @test_cmpgt_sd(
+// CHECK-X32-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X32-NEXT: [[ENTRY:.*:]]
+// CHECK-X32-NEXT: [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[__C_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 1)
+// CHECK-X32-NEXT: store <2 x double> [[TMP4]], ptr [[__C_I]], align 16
+// CHECK-X32-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[__C_I]], align 16
+// CHECK-X32-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-X32-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// CHECK-X32-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+// CHECK-X32-NEXT: [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// CHECK-X32-NEXT: store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-X32-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-X32-NEXT: ret <2 x double> [[TMP7]]
+//
+// CHECK-X64-LABEL: define dso_local <2 x double> @test_cmpgt_sd(
+// CHECK-X64-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X64-NEXT: [[ENTRY:.*:]]
+// CHECK-X64-NEXT: [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[__C_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 1)
+// CHECK-X64-NEXT: store <2 x double> [[TMP4]], ptr [[__C_I]], align 16
+// CHECK-X64-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[__C_I]], align 16
+// CHECK-X64-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-X64-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// CHECK-X64-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+// CHECK-X64-NEXT: [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// CHECK-X64-NEXT: store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-X64-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-X64-NEXT: ret <2 x double> [[TMP7]]
+//
 __m128d test_cmpgt_sd(__m128d a, __m128d b) {
-  // CHECK: @llvm.x86.sse2.cmp.sd({{.*}}, i8 1)
-  // CHECK: shufflevector <{{.*}}, <2 x i32> <i32 0, i32 3>
   return _mm_cmpgt_sd(a, b);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_cmpge_sd(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__C_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 2)
+// CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[__C_I]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[__C_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// CHECK-NEXT: store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT: ret <2 x double> [[TMP7]]
+// CHECK-X32-LABEL: define dso_local <2 x double> @test_cmpge_sd(
+// CHECK-X32-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X32-NEXT: [[ENTRY:.*:]]
+// CHECK-X32-NEXT: [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[__C_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 2)
+// CHECK-X32-NEXT: store <2 x double> [[TMP4]], ptr [[__C_I]], align 16
+// CHECK-X32-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[__C_I]], align 16
+// CHECK-X32-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-X32-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// CHECK-X32-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+// CHECK-X32-NEXT: [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// CHECK-X32-NEXT: store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-X32-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-X32-NEXT: ret <2 x double> [[TMP7]]
+//
+// CHECK-X64-LABEL: define dso_local <2 x double> @test_cmpge_sd(
+// CHECK-X64-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X64-NEXT: [[ENTRY:.*:]]
+// CHECK-X64-NEXT: [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[__C_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 2)
+// CHECK-X64-NEXT: store <2 x double> [[TMP4]], ptr [[__C_I]], align 16
+// CHECK-X64-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[__C_I]], align 16
+// CHECK-X64-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-X64-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// CHECK-X64-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+// CHECK-X64-NEXT: [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// CHECK-X64-NEXT: store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-X64-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-X64-NEXT: ret <2 x double> [[TMP7]]
+//
 __m128d test_cmpge_sd(__m128d a, __m128d b) {
-  // CHECK: @llvm.x86.sse2.cmp.sd({{.*}}, i8 2)
-  // CHECK: shufflevector <{{.*}}, <2 x i32> <i32 0, i32 3>
   return _mm_cmpge_sd(a, b);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_cmpngt_sd(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__C_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 5)
+// CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[__C_I]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[__C_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// CHECK-NEXT: store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT: ret <2 x double> [[TMP7]]
+// CHECK-X32-LABEL: define dso_local <2 x double> @test_cmpngt_sd(
+// CHECK-X32-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X32-NEXT: [[ENTRY:.*:]]
+// CHECK-X32-NEXT: [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[__C_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 5)
+// CHECK-X32-NEXT: store <2 x double> [[TMP4]], ptr [[__C_I]], align 16
+// CHECK-X32-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[__C_I]], align 16
+// CHECK-X32-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-X32-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// CHECK-X32-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+// CHECK-X32-NEXT: [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// CHECK-X32-NEXT: store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-X32-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-X32-NEXT: ret <2 x double> [[TMP7]]
+//
+// CHECK-X64-LABEL: define dso_local <2 x double> @test_cmpngt_sd(
+// CHECK-X64-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X64-NEXT: [[ENTRY:.*:]]
+// CHECK-X64-NEXT: [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[__C_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 5)
+// CHECK-X64-NEXT: store <2 x double> [[TMP4]], ptr [[__C_I]], align 16
+// CHECK-X64-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[__C_I]], align 16
+// CHECK-X64-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-X64-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// CHECK-X64-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X64-NEXT: [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+// CHECK-X64-NEXT: [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// CHECK-X64-NEXT: store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-X64-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-X64-NEXT: ret <2 x double> [[TMP7]]
+//
 __m128d test_cmpngt_sd(__m128d a, __m128d b) {
-  // CHECK: @llvm.x86.sse2.cmp.sd({{.*}}, i8 5)
-  // CHECK: shufflevector <{{.*}}, <2 x i32> <i32 0, i32 3>
   return _mm_cmpngt_sd(a, b);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_cmpnge_sd(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__C_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 6)
+// CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[__C_I]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[__C_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT: [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// CHECK-NEXT: store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT: ret <2 x double> [[TMP7]]
+// CHECK-X32-LABEL: define dso_local <2 x double> @test_cmpnge_sd(
+// CHECK-X32-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X32-NEXT: [[ENTRY:.*:]]
+// CHECK-X32-NEXT: [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[__C_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X32-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-X32-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-X32-NEXT: store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 6)
+// CHECK-X32-NEXT: store <2 x double> [[TMP4]], ptr [[__C_I]], align 16
+// CHECK-X32-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[__C_I]], align 16
+// CHECK-X32-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-X32-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// CHECK-X32-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-X32-NEXT: [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+// CHECK-X32-NEXT: [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// CHECK-X32-NEXT: store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-X32-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-X32-NEXT: ret <2 x double> [[TMP7]]
+//
+// CHECK-X64-LABEL: define dso_local <2 x double> @test_cmpnge_sd(
+// CHECK-X64-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-X64-NEXT: [[ENTRY:.*:]]
+// CHECK-X64-NEXT: [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[__C_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-X64-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-X64-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-X64-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16 +// CHECK-X64-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16 +// CHECK-X64-NEXT: store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16 +// CHECK-X64-NEXT: store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16 +// CHECK-X64-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16 +// CHECK-X64-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16 +// CHECK-X64-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 6) +// CHECK-X64-NEXT: store <2 x double> [[TMP4]], ptr [[__C_I]], align 16 +// CHECK-X64-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[__C_I]], align 16 +// CHECK-X64-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 +// CHECK-X64-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0 +// CHECK-X64-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16 +// CHECK-X64-NEXT: [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +// CHECK-X64-NEXT: [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1 +// CHECK-X64-NEXT: store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16 +// CHECK-X64-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16 +// CHECK-X64-NEXT: ret <2 x double> [[TMP7]] +// __m128d test_cmpnge_sd(__m128d a, __m128d b) { - // CHECK: @llvm.x86.sse2.cmp.sd({{.*}}, i8 6) - // CHECK: shufflevector <{{.*}}, <2 x i32> return _mm_cmpnge_sd(a, b); } diff --git a/clang/test/CodeGen/X86/avx-shuffle-builtins.c b/clang/test/CodeGen/X86/avx-shuffle-builtins.c index d184d28f3e07a..c06ce80e82ca6 100644 --- a/clang/test/CodeGen/X86/avx-shuffle-builtins.c +++ b/clang/test/CodeGen/X86/avx-shuffle-builtins.c @@ -1,7 +1,7 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // REQUIRES: x86-registered-target -// RUN: %clang_cc1 -ffreestanding %s -O3 -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,X64 -// RUN: %clang_cc1 -ffreestanding %s -O3 -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,X86 -// FIXME: This is testing optimized generation of shuffle instructions and should be fixed. 
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-- -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,X64 +// RUN: %clang_cc1 -ffreestanding %s -triple=i386-- -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,X86 #include @@ -10,201 +10,662 @@ // Test LLVM IR codegen of shuffle instructions, checking if the masks are correct // +// CHECK-LABEL: define dso_local <8 x float> @x( +// CHECK-SAME: <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <8 x float> [[B]], ptr [[B_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32 +// CHECK-NEXT: [[SHUFP:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[TMP1]], <8 x i32> +// CHECK-NEXT: ret <8 x float> [[SHUFP]] +// __m256 x(__m256 a, __m256 b) { - // CHECK-LABEL: x - // CHECK: shufflevector{{.*}} return _mm256_shuffle_ps(a, b, 203); } +// CHECK-LABEL: define dso_local <2 x double> @test_mm_permute_pd( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +// CHECK-NEXT: ret <2 x double> [[PERMIL]] +// __m128d test_mm_permute_pd(__m128d a) { - // CHECK-LABEL: test_mm_permute_pd - // CHECK: shufflevector{{.*}} return _mm_permute_pd(a, 1); } +// CHECK-LABEL: define dso_local <4 x double> @test_mm256_permute_pd( +// CHECK-SAME: <4 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x double>, align 32 +// CHECK-NEXT: store <4 x double> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> +// CHECK-NEXT: ret <4 x double> [[PERMIL]] +// __m256d test_mm256_permute_pd(__m256d a) { - // CHECK-LABEL: test_mm256_permute_pd - // CHECK: shufflevector{{.*}} return _mm256_permute_pd(a, 5); } +// CHECK-LABEL: define dso_local <4 x float> @test_mm_permute_ps( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> +// CHECK-NEXT: ret <4 x float> [[PERMIL]] +// __m128 test_mm_permute_ps(__m128 a) { - // CHECK-LABEL: test_mm_permute_ps - // CHECK: shufflevector{{.*}} return _mm_permute_ps(a, 0x1b); } // Test case for PR12401 +// CHECK-LABEL: define dso_local <4 x float> @test_mm_permute_ps2( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 
x float>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> +// CHECK-NEXT: ret <4 x float> [[PERMIL]] +// __m128 test_mm_permute_ps2(__m128 a) { - // CHECK-LABEL: test_mm_permute_ps2 - // CHECK: shufflevector{{.*}} return _mm_permute_ps(a, 0xe6); } +// CHECK-LABEL: define dso_local <8 x float> @test_mm256_permute_ps( +// CHECK-SAME: <8 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> +// CHECK-NEXT: ret <8 x float> [[PERMIL]] +// __m256 test_mm256_permute_ps(__m256 a) { - // CHECK-LABEL: test_mm256_permute_ps - // CHECK: shufflevector{{.*}} return _mm256_permute_ps(a, 0x1b); } +// CHECK-LABEL: define dso_local <4 x double> @test_mm256_permute2f128_pd( +// CHECK-SAME: <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x double>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x double>, align 32 +// CHECK-NEXT: store <4 x double> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <4 x double> [[B]], ptr [[B_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32 +// CHECK-NEXT: [[VPERM:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> +// CHECK-NEXT: ret <4 x double> [[VPERM]] +// __m256d test_mm256_permute2f128_pd(__m256d a, __m256d b) { - // CHECK-LABEL: test_mm256_permute2f128_pd - // CHECK: shufflevector{{.*}} return _mm256_permute2f128_pd(a, b, 0x31); } +// CHECK-LABEL: define dso_local <8 x float> @test_mm256_permute2f128_ps( +// CHECK-SAME: <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <8 x float> [[B]], ptr [[B_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32 +// CHECK-NEXT: [[VPERM:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP0]], <8 x i32> +// CHECK-NEXT: ret <8 x float> [[VPERM]] +// __m256 test_mm256_permute2f128_ps(__m256 a, __m256 b) { - // CHECK-LABEL: test_mm256_permute2f128_ps - // CHECK: shufflevector{{.*}} return _mm256_permute2f128_ps(a, b, 0x13); } +// CHECK-LABEL: define dso_local <4 x i64> @test_mm256_permute2f128_si256( +// CHECK-SAME: <4 x i64> noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x i64>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x i64>, align 32 +// CHECK-NEXT: store <4 x i64> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <4 x i64> [[B]], ptr [[B_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <8 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> 
[[TMP2]] to <8 x i32> +// CHECK-NEXT: [[VPERM:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP3]], <8 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[VPERM]] to <4 x i64> +// CHECK-NEXT: ret <4 x i64> [[TMP4]] +// __m256i test_mm256_permute2f128_si256(__m256i a, __m256i b) { - // CHECK-LABEL: test_mm256_permute2f128_si256 - // CHECK: shufflevector{{.*}} return _mm256_permute2f128_si256(a, b, 0x20); } __m128 +// X64-LABEL: define dso_local <4 x float> @test_mm_broadcast_ss( +// X64-SAME: ptr noundef [[__A:%.*]]) #[[ATTR1]] { +// X64-NEXT: [[ENTRY:.*:]] +// X64-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8 +// X64-NEXT: [[__F_I:%.*]] = alloca float, align 4 +// X64-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x float>, align 16 +// X64-NEXT: [[__A_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: store ptr [[__A]], ptr [[__A_ADDR]], align 8 +// X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__A_ADDR]], align 8 +// X64-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8 +// X64-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8 +// X64-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 1 +// X64-NEXT: store float [[TMP2]], ptr [[__F_I]], align 4 +// X64-NEXT: [[TMP3:%.*]] = load float, ptr [[__F_I]], align 4 +// X64-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 +// X64-NEXT: [[TMP4:%.*]] = load float, ptr [[__F_I]], align 4 +// X64-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[TMP4]], i32 1 +// X64-NEXT: [[TMP5:%.*]] = load float, ptr [[__F_I]], align 4 +// X64-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[TMP5]], i32 2 +// X64-NEXT: [[TMP6:%.*]] = load float, ptr [[__F_I]], align 4 +// X64-NEXT: [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT3_I]], float [[TMP6]], i32 3 +// X64-NEXT: store <4 x float> [[VECINIT4_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16 +// X64-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16 +// X64-NEXT: ret <4 x float> [[TMP7]] +// +// X86-LABEL: define dso_local <4 x float> @test_mm_broadcast_ss( +// X86-SAME: ptr noundef [[__A:%.*]]) #[[ATTR1]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[__F_I:%.*]] = alloca float, align 4 +// X86-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x float>, align 16 +// X86-NEXT: [[__A_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[__A]], ptr [[__A_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__A_ADDR]], align 4 +// X86-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 4 +// X86-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 1 +// X86-NEXT: store float [[TMP2]], ptr [[__F_I]], align 4 +// X86-NEXT: [[TMP3:%.*]] = load float, ptr [[__F_I]], align 4 +// X86-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 +// X86-NEXT: [[TMP4:%.*]] = load float, ptr [[__F_I]], align 4 +// X86-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[TMP4]], i32 1 +// X86-NEXT: [[TMP5:%.*]] = load float, ptr [[__F_I]], align 4 +// X86-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[TMP5]], i32 2 +// X86-NEXT: [[TMP6:%.*]] = load float, ptr [[__F_I]], align 4 +// X86-NEXT: [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT3_I]], float [[TMP6]], i32 3 +// X86-NEXT: store <4 x float> [[VECINIT4_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16 +// X86-NEXT: 
[[TMP7:%.*]] = load <4 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16 +// X86-NEXT: ret <4 x float> [[TMP7]] +// test_mm_broadcast_ss(float const *__a) { - // CHECK-LABEL: test_mm_broadcast_ss - // CHECK: insertelement <4 x float> {{.*}}, i64 0 - // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> poison, <4 x i32> zeroinitializer return _mm_broadcast_ss(__a); } __m256d +// X64-LABEL: define dso_local <4 x double> @test_mm256_broadcast_sd( +// X64-SAME: ptr noundef [[__A:%.*]]) #[[ATTR0]] { +// X64-NEXT: [[ENTRY:.*:]] +// X64-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8 +// X64-NEXT: [[__D_I:%.*]] = alloca double, align 8 +// X64-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x double>, align 32 +// X64-NEXT: [[__A_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: store ptr [[__A]], ptr [[__A_ADDR]], align 8 +// X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__A_ADDR]], align 8 +// X64-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8 +// X64-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8 +// X64-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP1]], align 1 +// X64-NEXT: store double [[TMP2]], ptr [[__D_I]], align 8 +// X64-NEXT: [[TMP3:%.*]] = load double, ptr [[__D_I]], align 8 +// X64-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x double> poison, double [[TMP3]], i32 0 +// X64-NEXT: [[TMP4:%.*]] = load double, ptr [[__D_I]], align 8 +// X64-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x double> [[VECINIT_I]], double [[TMP4]], i32 1 +// X64-NEXT: [[TMP5:%.*]] = load double, ptr [[__D_I]], align 8 +// X64-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x double> [[VECINIT2_I]], double [[TMP5]], i32 2 +// X64-NEXT: [[TMP6:%.*]] = load double, ptr [[__D_I]], align 8 +// X64-NEXT: [[VECINIT4_I:%.*]] = insertelement <4 x double> [[VECINIT3_I]], double [[TMP6]], i32 3 +// X64-NEXT: store <4 x double> [[VECINIT4_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 32 +// X64-NEXT: [[TMP7:%.*]] = load <4 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32 +// X64-NEXT: ret <4 x double> [[TMP7]] +// +// X86-LABEL: define dso_local <4 x double> @test_mm256_broadcast_sd( +// X86-SAME: ptr noundef [[__A:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[__D_I:%.*]] = alloca double, align 8 +// X86-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x double>, align 32 +// X86-NEXT: [[__A_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[__A]], ptr [[__A_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__A_ADDR]], align 4 +// X86-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 4 +// X86-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP1]], align 1 +// X86-NEXT: store double [[TMP2]], ptr [[__D_I]], align 8 +// X86-NEXT: [[TMP3:%.*]] = load double, ptr [[__D_I]], align 8 +// X86-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x double> poison, double [[TMP3]], i32 0 +// X86-NEXT: [[TMP4:%.*]] = load double, ptr [[__D_I]], align 8 +// X86-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x double> [[VECINIT_I]], double [[TMP4]], i32 1 +// X86-NEXT: [[TMP5:%.*]] = load double, ptr [[__D_I]], align 8 +// X86-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x double> [[VECINIT2_I]], double [[TMP5]], i32 2 +// X86-NEXT: [[TMP6:%.*]] = load double, ptr [[__D_I]], align 8 +// X86-NEXT: [[VECINIT4_I:%.*]] = insertelement <4 x double> [[VECINIT3_I]], double [[TMP6]], i32 3 +// X86-NEXT: store <4 x double> [[VECINIT4_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 32 +// X86-NEXT: [[TMP7:%.*]] = 
load <4 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32 +// X86-NEXT: ret <4 x double> [[TMP7]] +// test_mm256_broadcast_sd(double const *__a) { - // CHECK-LABEL: test_mm256_broadcast_sd - // CHECK: insertelement <4 x double> {{.*}}, i64 0 - // CHECK: shufflevector <4 x double> {{.*}}, <4 x double> poison, <4 x i32> zeroinitializer return _mm256_broadcast_sd(__a); } __m256 +// X64-LABEL: define dso_local <8 x float> @test_mm256_broadcast_ss( +// X64-SAME: ptr noundef [[__A:%.*]]) #[[ATTR0]] { +// X64-NEXT: [[ENTRY:.*:]] +// X64-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8 +// X64-NEXT: [[__F_I:%.*]] = alloca float, align 4 +// X64-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x float>, align 32 +// X64-NEXT: [[__A_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: store ptr [[__A]], ptr [[__A_ADDR]], align 8 +// X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__A_ADDR]], align 8 +// X64-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8 +// X64-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8 +// X64-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 1 +// X64-NEXT: store float [[TMP2]], ptr [[__F_I]], align 4 +// X64-NEXT: [[TMP3:%.*]] = load float, ptr [[__F_I]], align 4 +// X64-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 +// X64-NEXT: [[TMP4:%.*]] = load float, ptr [[__F_I]], align 4 +// X64-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP4]], i32 1 +// X64-NEXT: [[TMP5:%.*]] = load float, ptr [[__F_I]], align 4 +// X64-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP5]], i32 2 +// X64-NEXT: [[TMP6:%.*]] = load float, ptr [[__F_I]], align 4 +// X64-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP6]], i32 3 +// X64-NEXT: [[TMP7:%.*]] = load float, ptr [[__F_I]], align 4 +// X64-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP7]], i32 4 +// X64-NEXT: [[TMP8:%.*]] = load float, ptr [[__F_I]], align 4 +// X64-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP8]], i32 5 +// X64-NEXT: [[TMP9:%.*]] = load float, ptr [[__F_I]], align 4 +// X64-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP9]], i32 6 +// X64-NEXT: [[TMP10:%.*]] = load float, ptr [[__F_I]], align 4 +// X64-NEXT: [[VECINIT8_I:%.*]] = insertelement <8 x float> [[VECINIT7_I]], float [[TMP10]], i32 7 +// X64-NEXT: store <8 x float> [[VECINIT8_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 32 +// X64-NEXT: [[TMP11:%.*]] = load <8 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32 +// X64-NEXT: ret <8 x float> [[TMP11]] +// +// X86-LABEL: define dso_local <8 x float> @test_mm256_broadcast_ss( +// X86-SAME: ptr noundef [[__A:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[__F_I:%.*]] = alloca float, align 4 +// X86-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x float>, align 32 +// X86-NEXT: [[__A_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[__A]], ptr [[__A_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__A_ADDR]], align 4 +// X86-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 4 +// X86-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 1 +// X86-NEXT: store float [[TMP2]], ptr [[__F_I]], align 4 +// X86-NEXT: [[TMP3:%.*]] = load float, ptr [[__F_I]], align 4 +// X86-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> poison, float 
[[TMP3]], i32 0 +// X86-NEXT: [[TMP4:%.*]] = load float, ptr [[__F_I]], align 4 +// X86-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP4]], i32 1 +// X86-NEXT: [[TMP5:%.*]] = load float, ptr [[__F_I]], align 4 +// X86-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP5]], i32 2 +// X86-NEXT: [[TMP6:%.*]] = load float, ptr [[__F_I]], align 4 +// X86-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP6]], i32 3 +// X86-NEXT: [[TMP7:%.*]] = load float, ptr [[__F_I]], align 4 +// X86-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP7]], i32 4 +// X86-NEXT: [[TMP8:%.*]] = load float, ptr [[__F_I]], align 4 +// X86-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP8]], i32 5 +// X86-NEXT: [[TMP9:%.*]] = load float, ptr [[__F_I]], align 4 +// X86-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP9]], i32 6 +// X86-NEXT: [[TMP10:%.*]] = load float, ptr [[__F_I]], align 4 +// X86-NEXT: [[VECINIT8_I:%.*]] = insertelement <8 x float> [[VECINIT7_I]], float [[TMP10]], i32 7 +// X86-NEXT: store <8 x float> [[VECINIT8_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 32 +// X86-NEXT: [[TMP11:%.*]] = load <8 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32 +// X86-NEXT: ret <8 x float> [[TMP11]] +// test_mm256_broadcast_ss(float const *__a) { - // CHECK-LABEL: test_mm256_broadcast_ss - // CHECK: insertelement <8 x float> {{.*}}, i64 0 - // CHECK: shufflevector <8 x float> {{.*}}, <8 x float> poison, <8 x i32> zeroinitializer return _mm256_broadcast_ss(__a); } // Make sure we have the correct mask for each insertf128 case. +// CHECK-LABEL: define dso_local <8 x float> @test_mm256_insertf128_ps_0( +// CHECK-SAME: <8 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[WIDEN:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> +// CHECK-NEXT: [[INSERT:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[WIDEN]], <8 x i32> +// CHECK-NEXT: ret <8 x float> [[INSERT]] +// __m256 test_mm256_insertf128_ps_0(__m256 a, __m128 b) { - // CHECK-LABEL: test_mm256_insertf128_ps_0 - // CHECK: shufflevector{{.*}} return _mm256_insertf128_ps(a, b, 0); } +// CHECK-LABEL: define dso_local <4 x double> @test_mm256_insertf128_pd_0( +// CHECK-SAME: <4 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x double>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: store <4 x double> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[WIDEN:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> +// CHECK-NEXT: [[INSERT:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[WIDEN]], <4 x i32> +// CHECK-NEXT: 
ret <4 x double> [[INSERT]] +// __m256d test_mm256_insertf128_pd_0(__m256d a, __m128d b) { - // CHECK-LABEL: test_mm256_insertf128_pd_0 - // CHECK: shufflevector{{.*}} return _mm256_insertf128_pd(a, b, 0); } +// CHECK-LABEL: define dso_local <4 x i64> @test_mm256_insertf128_si256_0( +// CHECK-SAME: <4 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x i64>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: store <4 x i64> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <2 x i64> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <8 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[WIDEN:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +// CHECK-NEXT: [[INSERT:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[WIDEN]], <8 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[INSERT]] to <4 x i64> +// CHECK-NEXT: ret <4 x i64> [[TMP4]] +// __m256i test_mm256_insertf128_si256_0(__m256i a, __m128i b) { - // CHECK-LABEL: test_mm256_insertf128_si256_0 - // X64: shufflevector{{.*}} - // X86: shufflevector{{.*}} return _mm256_insertf128_si256(a, b, 0); } +// CHECK-LABEL: define dso_local <8 x float> @test_mm256_insertf128_ps_1( +// CHECK-SAME: <8 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[WIDEN:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> +// CHECK-NEXT: [[INSERT:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[WIDEN]], <8 x i32> +// CHECK-NEXT: ret <8 x float> [[INSERT]] +// __m256 test_mm256_insertf128_ps_1(__m256 a, __m128 b) { - // CHECK-LABEL: test_mm256_insertf128_ps_1 - // CHECK: shufflevector{{.*}} return _mm256_insertf128_ps(a, b, 1); } +// CHECK-LABEL: define dso_local <4 x double> @test_mm256_insertf128_pd_1( +// CHECK-SAME: <4 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x double>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: store <4 x double> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[WIDEN:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> +// CHECK-NEXT: [[INSERT:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[WIDEN]], <4 x i32> +// CHECK-NEXT: ret <4 x double> [[INSERT]] +// __m256d test_mm256_insertf128_pd_1(__m256d a, __m128d b) { - // CHECK-LABEL: test_mm256_insertf128_pd_1 - // CHECK: shufflevector{{.*}} return _mm256_insertf128_pd(a, b, 1); } +// CHECK-LABEL: define dso_local <4 x 
i64> @test_mm256_insertf128_si256_1( +// CHECK-SAME: <4 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x i64>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: store <4 x i64> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <2 x i64> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <8 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[WIDEN:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +// CHECK-NEXT: [[INSERT:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[WIDEN]], <8 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[INSERT]] to <4 x i64> +// CHECK-NEXT: ret <4 x i64> [[TMP4]] +// __m256i test_mm256_insertf128_si256_1(__m256i a, __m128i b) { - // CHECK-LABEL: test_mm256_insertf128_si256_1 - // X64: shufflevector{{.*}} - // X86: shufflevector{{.*}} return _mm256_insertf128_si256(a, b, 1); } // Make sure we have the correct mask for each extractf128 case. +// CHECK-LABEL: define dso_local <4 x float> @test_mm256_extractf128_ps_0( +// CHECK-SAME: <8 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> +// CHECK-NEXT: ret <4 x float> [[EXTRACT]] +// __m128 test_mm256_extractf128_ps_0(__m256 a) { - // X64-LABEL: test_mm256_extractf128_ps_0 - // X64: shufflevector{{.*}} // - // X86-LABEL: test_mm256_extractf128_ps_0 - // X86: shufflevector{{.*}} return _mm256_extractf128_ps(a, 0); } +// CHECK-LABEL: define dso_local <2 x double> @test_mm256_extractf128_pd_0( +// CHECK-SAME: <4 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x double>, align 32 +// CHECK-NEXT: store <4 x double> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> +// CHECK-NEXT: ret <2 x double> [[EXTRACT]] +// __m128d test_mm256_extractf128_pd_0(__m256d a) { - // CHECK-LABEL: test_mm256_extractf128_pd_0 - // CHECK: shufflevector{{.*}} return _mm256_extractf128_pd(a, 0); } +// CHECK-LABEL: define dso_local <2 x i64> @test_mm256_extractf128_si256_0( +// CHECK-SAME: <4 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x i64>, align 32 +// CHECK-NEXT: store <4 x i64> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <8 x i32> +// CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[EXTRACT]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// __m128i test_mm256_extractf128_si256_0(__m256i a) { - // CHECK-LABEL: test_mm256_extractf128_si256_0 - // CHECK: shufflevector{{.*}} return _mm256_extractf128_si256(a, 0); } +// CHECK-LABEL: 
define dso_local <4 x float> @test_mm256_extractf128_ps_1( +// CHECK-SAME: <8 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> +// CHECK-NEXT: ret <4 x float> [[EXTRACT]] +// __m128 test_mm256_extractf128_ps_1(__m256 a) { - // X64-LABEL: test_mm256_extractf128_ps_1 - // X64: shufflevector{{.*}} // - // X86-LABEL: test_mm256_extractf128_ps_1 - // X86: shufflevector{{.*}} return _mm256_extractf128_ps(a, 1); } +// CHECK-LABEL: define dso_local <2 x double> @test_mm256_extractf128_pd_1( +// CHECK-SAME: <4 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x double>, align 32 +// CHECK-NEXT: store <4 x double> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> +// CHECK-NEXT: ret <2 x double> [[EXTRACT]] +// __m128d test_mm256_extractf128_pd_1(__m256d a) { - // CHECK-LABEL: test_mm256_extractf128_pd_1 - // CHECK: shufflevector{{.*}} return _mm256_extractf128_pd(a, 1); } +// CHECK-LABEL: define dso_local <2 x i64> @test_mm256_extractf128_si256_1( +// CHECK-SAME: <4 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x i64>, align 32 +// CHECK-NEXT: store <4 x i64> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <8 x i32> +// CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[EXTRACT]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// __m128i test_mm256_extractf128_si256_1(__m256i a) { - // CHECK-LABEL: test_mm256_extractf128_si256_1 - // CHECK: shufflevector{{.*}} return _mm256_extractf128_si256(a, 1); } +// CHECK-LABEL: define dso_local <8 x float> @test_mm256_set_m128( +// CHECK-SAME: <4 x float> noundef [[HI:%.*]], <4 x float> noundef [[LO:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[__HI_ADDR_I:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: [[__LO_ADDR_I:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: [[HI_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: [[LO_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: store <4 x float> [[HI]], ptr [[HI_ADDR]], align 16 +// CHECK-NEXT: store <4 x float> [[LO]], ptr [[LO_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[HI_ADDR]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[LO_ADDR]], align 16 +// CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[__HI_ADDR_I]], align 16 +// CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[__LO_ADDR_I]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__LO_ADDR_I]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__HI_ADDR_I]], align 16 +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <8 x i32> +// CHECK-NEXT: ret <8 x float> [[SHUFFLE_I]] +// __m256 test_mm256_set_m128(__m128 hi, __m128 lo) { - // CHECK-LABEL: test_mm256_set_m128 - // CHECK: 
shufflevector{{.*}} return _mm256_set_m128(hi, lo); } +// CHECK-LABEL: define dso_local <4 x double> @test_mm256_set_m128d( +// CHECK-SAME: <2 x double> noundef [[HI:%.*]], <2 x double> noundef [[LO:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[__HI_ADDR_I:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: [[__LO_ADDR_I:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: [[HI_ADDR:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: [[LO_ADDR:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: store <2 x double> [[HI]], ptr [[HI_ADDR]], align 16 +// CHECK-NEXT: store <2 x double> [[LO]], ptr [[LO_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[HI_ADDR]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[LO_ADDR]], align 16 +// CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[__HI_ADDR_I]], align 16 +// CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[__LO_ADDR_I]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[__LO_ADDR_I]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__HI_ADDR_I]], align 16 +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <4 x i32> +// CHECK-NEXT: ret <4 x double> [[SHUFFLE_I]] +// __m256d test_mm256_set_m128d(__m128d hi, __m128d lo) { - // CHECK-LABEL: test_mm256_set_m128d - // CHECK: shufflevector{{.*}} return _mm256_set_m128d(hi, lo); } +// CHECK-LABEL: define dso_local <4 x i64> @test_mm256_set_m128i( +// CHECK-SAME: <2 x i64> noundef [[HI:%.*]], <2 x i64> noundef [[LO:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[__HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[__LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[HI_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[LO_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: store <2 x i64> [[HI]], ptr [[HI_ADDR]], align 16 +// CHECK-NEXT: store <2 x i64> [[LO]], ptr [[LO_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[HI_ADDR]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[LO_ADDR]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[__HI_ADDR_I]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[__LO_ADDR_I]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[__LO_ADDR_I]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[__HI_ADDR_I]], align 16 +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> +// CHECK-NEXT: ret <4 x i64> [[SHUFFLE_I]] +// __m256i test_mm256_set_m128i(__m128i hi, __m128i lo) { - // CHECK-LABEL: test_mm256_set_m128i - // CHECK: shufflevector{{.*}} return _mm256_set_m128i(hi, lo); } +// CHECK-LABEL: define dso_local <8 x float> @test_mm256_setr_m128( +// CHECK-SAME: <4 x float> noundef [[HI:%.*]], <4 x float> noundef [[LO:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[__HI_ADDR_I_I:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: [[__LO_ADDR_I_I:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: [[__LO_ADDR_I:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: [[__HI_ADDR_I:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: [[HI_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: [[LO_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: store <4 x float> [[HI]], ptr [[HI_ADDR]], align 16 +// CHECK-NEXT: store <4 x float> [[LO]], ptr [[LO_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[LO_ADDR]], align 16 
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[HI_ADDR]], align 16 +// CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[__LO_ADDR_I]], align 16 +// CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[__HI_ADDR_I]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__HI_ADDR_I]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__LO_ADDR_I]], align 16 +// CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[__HI_ADDR_I_I]], align 16 +// CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[__LO_ADDR_I_I]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__LO_ADDR_I_I]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[__HI_ADDR_I_I]], align 16 +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <8 x i32> +// CHECK-NEXT: ret <8 x float> [[SHUFFLE_I_I]] +// __m256 test_mm256_setr_m128(__m128 hi, __m128 lo) { - // CHECK-LABEL: test_mm256_setr_m128 - // CHECK: shufflevector{{.*}} return _mm256_setr_m128(lo, hi); } +// CHECK-LABEL: define dso_local <4 x double> @test_mm256_setr_m128d( +// CHECK-SAME: <2 x double> noundef [[HI:%.*]], <2 x double> noundef [[LO:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[__HI_ADDR_I_I:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: [[__LO_ADDR_I_I:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: [[__LO_ADDR_I:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: [[__HI_ADDR_I:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: [[HI_ADDR:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: [[LO_ADDR:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: store <2 x double> [[HI]], ptr [[HI_ADDR]], align 16 +// CHECK-NEXT: store <2 x double> [[LO]], ptr [[LO_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[LO_ADDR]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[HI_ADDR]], align 16 +// CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[__LO_ADDR_I]], align 16 +// CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[__HI_ADDR_I]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[__HI_ADDR_I]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__LO_ADDR_I]], align 16 +// CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[__HI_ADDR_I_I]], align 16 +// CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[__LO_ADDR_I_I]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[__LO_ADDR_I_I]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[__HI_ADDR_I_I]], align 16 +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <4 x i32> +// CHECK-NEXT: ret <4 x double> [[SHUFFLE_I_I]] +// __m256d test_mm256_setr_m128d(__m128d hi, __m128d lo) { - // CHECK-LABEL: test_mm256_setr_m128d - // CHECK: shufflevector{{.*}} return _mm256_setr_m128d(lo, hi); } +// CHECK-LABEL: define dso_local <4 x i64> @test_mm256_setr_m128i( +// CHECK-SAME: <2 x i64> noundef [[HI:%.*]], <2 x i64> noundef [[LO:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[__HI_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[__LO_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[__LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[__HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[HI_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[LO_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: store <2 x i64> [[HI]], ptr [[HI_ADDR]], align 16 +// CHECK-NEXT: store <2 x i64> [[LO]], ptr [[LO_ADDR]], 
align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[LO_ADDR]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[HI_ADDR]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[__LO_ADDR_I]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[__HI_ADDR_I]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[__HI_ADDR_I]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[__LO_ADDR_I]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[__HI_ADDR_I_I]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[__LO_ADDR_I_I]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[__LO_ADDR_I_I]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[__HI_ADDR_I_I]], align 16 +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> +// CHECK-NEXT: ret <4 x i64> [[SHUFFLE_I_I]] +// __m256i test_mm256_setr_m128i(__m128i hi, __m128i lo) { - // CHECK-LABEL: test_mm256_setr_m128i - // CHECK: shufflevector{{.*}} return _mm256_setr_m128i(lo, hi); } diff --git a/clang/test/CodeGen/X86/sse.c b/clang/test/CodeGen/X86/sse.c index a75b8dc77e86e..01521658a00bd 100644 --- a/clang/test/CodeGen/X86/sse.c +++ b/clang/test/CodeGen/X86/sse.c @@ -1,42 +1,90 @@ -// RUN: %clang_cc1 -ffreestanding -O3 -triple x86_64-apple-macosx10.8.0 -target-feature +sse4.1 -emit-llvm %s -o - | FileCheck %s -// FIXME: This test currently depends on optimization - it should be rewritten to avoid it. +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -ffreestanding -triple x86_64-- -target-feature +sse4.1 -emit-llvm %s -o - | FileCheck %s #include // Byte-shifts look reversed due to xmm register layout +// CHECK-LABEL: define dso_local <2 x i64> @test_mm_slli_si128( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: store <2 x i64> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[PSLLDQ:%.*]] = shufflevector <16 x i8> zeroinitializer, <16 x i8> [[CAST]], <16 x i32> +// CHECK-NEXT: [[CAST1:%.*]] = bitcast <16 x i8> [[PSLLDQ]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[CAST1]] +// __m128i test_mm_slli_si128(__m128i a) { - // CHECK-LABEL: @test_mm_slli_si128 - // CHECK: shufflevector <16 x i8> <{{.*}}, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> {{.*}}, <16 x i32> return _mm_slli_si128(a, 5); } +// CHECK-LABEL: define dso_local <2 x i64> @test_mm_slli_si128_0( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: store <2 x i64> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[PSLLDQ:%.*]] = shufflevector <16 x i8> zeroinitializer, <16 x i8> [[CAST]], <16 x i32> +// CHECK-NEXT: [[CAST1:%.*]] = bitcast <16 x i8> [[PSLLDQ]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[CAST1]] +// __m128i test_mm_slli_si128_0(__m128i a) { - // CHECK-LABEL: @test_mm_slli_si128_0 - // CHECK-NOT: shufflevector return _mm_slli_si128(a, 0); } +// CHECK-LABEL: define dso_local <2 x i64> @test_mm_slli_si128_16( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] 
{ +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: store <2 x i64> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: ret <2 x i64> zeroinitializer +// __m128i test_mm_slli_si128_16(__m128i a) { - // CHECK-LABEL: @test_mm_slli_si128_16 - // CHECK-NOT: shufflevector return _mm_slli_si128(a, 16); } +// CHECK-LABEL: define dso_local <2 x i64> @test_mm_srli_si128( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: store <2 x i64> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[PSRLDQ:%.*]] = shufflevector <16 x i8> [[CAST]], <16 x i8> zeroinitializer, <16 x i32> +// CHECK-NEXT: [[CAST1:%.*]] = bitcast <16 x i8> [[PSRLDQ]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[CAST1]] +// __m128i test_mm_srli_si128(__m128i a) { - // CHECK-LABEL: @test_mm_srli_si128 - // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> , <16 x i32> return _mm_srli_si128(a, 5); } +// CHECK-LABEL: define dso_local <2 x i64> @test_mm_srli_si128_0( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: store <2 x i64> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[PSRLDQ:%.*]] = shufflevector <16 x i8> [[CAST]], <16 x i8> zeroinitializer, <16 x i32> +// CHECK-NEXT: [[CAST1:%.*]] = bitcast <16 x i8> [[PSRLDQ]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[CAST1]] +// __m128i test_mm_srli_si128_0(__m128i a) { - // CHECK-LABEL: @test_mm_srli_si128_0 - // CHECK-NOT: shufflevector return _mm_srli_si128(a, 0); } +// CHECK-LABEL: define dso_local <2 x i64> @test_mm_srli_si128_16( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: store <2 x i64> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: ret <2 x i64> zeroinitializer +// __m128i test_mm_srli_si128_16(__m128i a) { - // CHECK-LABEL: @test_mm_srli_si128_16 - // CHECK-NOT: shufflevector return _mm_srli_si128(a, 16); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index d68ae64f08aa9..cd08a4e834855 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -2248,13 +2248,15 @@ Instruction *InstCombinerImpl::foldSelectShuffle(ShuffleVectorInst &Shuf) { if (!Shuf.isSelect()) return nullptr; + Value *Op0 = Shuf.getOperand(0); + Value *Op1 = Shuf.getOperand(1); + // Canonicalize to choose from operand 0 first unless operand 1 is undefined. - // Commuting undef to operand 0 conflicts with another canonicalization. + // Only do so when the operands have the same complexity to avoid conflicting + // with complexity normalization.
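+  // For example (an illustrative sketch, not IR taken from this patch's
+  // tests): given equally complex operands %a and %b, the select shuffle
+  //   %s = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  // chooses element 0 from operand 1, so it is commuted to choose it from
+  // operand 0 instead:
+  //   %s = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 7>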
unsigned NumElts = cast<FixedVectorType>(Shuf.getType())->getNumElements(); - if (!match(Shuf.getOperand(1), m_Undef()) && - Shuf.getMaskValue(0) >= (int)NumElts) { - // TODO: Can we assert that both operands of a shuffle-select are not undef - // (otherwise, it would have been folded by instsimplify? + if (Shuf.getMaskValue(0) >= (int)NumElts && + getComplexity(Op0) == getComplexity(Op1)) { Shuf.commute(); return &Shuf; } @@ -2267,8 +2269,7 @@ Instruction *InstCombinerImpl::foldSelectShuffle(ShuffleVectorInst &Shuf) { return I; BinaryOperator *B0, *B1; - if (!match(Shuf.getOperand(0), m_BinOp(B0)) || - !match(Shuf.getOperand(1), m_BinOp(B1))) + if (!match(Op0, m_BinOp(B0)) || !match(Op1, m_BinOp(B1))) return nullptr; // If one operand is "0 - X", allow that to be viewed as "X * -1" @@ -2791,6 +2792,16 @@ Instruction *InstCombinerImpl::simplifyBinOpSplats(ShuffleVectorInst &SVI) { Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *LHS = SVI.getOperand(0); Value *RHS = SVI.getOperand(1); + + unsigned LHSComplexity = getComplexity(LHS); + unsigned RHSComplexity = getComplexity(RHS); + // Order operands from most complex to least complex so that, for example, + // constants or poison end up on the RHS. + if (LHSComplexity < RHSComplexity) { + SVI.commute(); + return &SVI; + } + SimplifyQuery ShufQuery = SQ.getWithInstruction(&SVI); if (auto *V = simplifyShuffleVectorInst(LHS, RHS, SVI.getShuffleMask(), SVI.getType(), ShufQuery)) @@ -2858,12 +2869,6 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { return new ShuffleVectorInst(LHS, createUnaryMask(Mask, LHSWidth)); } - // shuffle undef, x, mask --> shuffle x, undef, mask' - if (match(LHS, m_Undef())) { - SVI.commute(); - return &SVI; - } - if (Instruction *I = canonicalizeInsertSplat(SVI, Builder)) return I; diff --git a/llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll b/llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll index e2fe873d715cd..0576aef3fe4c3 100644 --- a/llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll +++ b/llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll @@ -240,7 +240,7 @@ define <4 x i1> @hadd_shuffle_4th_negative_v4i32(<4 x i32> %x, <4 x i32> %y) { ; CHECK-NEXT: [[TMP0:%.*]] = and <4 x i32> [[X]], ; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[Y]], ; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> , <4 x i32> ; CHECK-NEXT: [[RET:%.*]] = icmp ne <4 x i32> [[TMP3]], ; CHECK-NEXT: ret <4 x i1> [[RET]] ; diff --git a/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll b/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll index 6cbb2a246f5a4..e4206176b78f8 100644 --- a/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll @@ -87,7 +87,7 @@ define <8 x float> @widen_extract4(<8 x float> %ins, <2 x float> %ext) { define <8 x i16> @pr26015(<4 x i16> %t0) { ; CHECK-LABEL: @pr26015( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[T0:%.*]], <4 x i16> poison, <8 x i32> -; CHECK-NEXT: [[T5:%.*]] = shufflevector <8 x i16> , <8 x i16> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[T5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> , <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[T5]] ; %t1 = extractelement <4
x i16> %t0, i32 2
@@ -267,7 +267,7 @@ define <4 x i32> @extractelt_insertion(<2 x i32> %x, i32 %y) {
 ; CHECK-LABEL: @extractelt_insertion(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[B:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP0]], <4 x i32>
+; CHECK-NEXT: [[B:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> , <4 x i32>
 ; CHECK-NEXT: [[C:%.*]] = add i32 [[Y:%.*]], 3
 ; CHECK-NEXT: [[D:%.*]] = extractelement <4 x i32> [[TMP0]], i32 [[C]]
 ; CHECK-NEXT: [[E:%.*]] = icmp eq i32 [[D]], 0
diff --git a/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll b/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll
index c87e2e8596c62..61f14060406b8 100644
--- a/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll
+++ b/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll
@@ -87,7 +87,7 @@ define <8 x float> @widen_extract4(<8 x float> %ins, <2 x float> %ext) {
 define <8 x i16> @pr26015(<4 x i16> %t0) {
 ; CHECK-LABEL: @pr26015(
 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[T0:%.*]], <4 x i16> poison, <8 x i32>
-; CHECK-NEXT: [[T5:%.*]] = shufflevector <8 x i16> , <8 x i16> [[TMP1]], <8 x i32>
+; CHECK-NEXT: [[T5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> , <8 x i32>
 ; CHECK-NEXT: ret <8 x i16> [[T5]]
 ;
   %t1 = extractelement <4 x i16> %t0, i32 2
@@ -267,7 +267,7 @@ define <4 x i32> @extractelt_insertion(<2 x i32> %x, i32 %y) {
 ; CHECK-LABEL: @extractelt_insertion(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[B:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP0]], <4 x i32>
+; CHECK-NEXT: [[B:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> , <4 x i32>
 ; CHECK-NEXT: [[C:%.*]] = add i32 [[Y:%.*]], 3
 ; CHECK-NEXT: [[D:%.*]] = extractelement <4 x i32> [[TMP0]], i32 [[C]]
 ; CHECK-NEXT: [[E:%.*]] = icmp eq i32 [[D]], 0
@@ -795,7 +795,7 @@ define <4 x i32> @infloop_D151807(<4 x float> %arg) {
 ; CHECK-NEXT: [[I:%.*]] = shufflevector <4 x float> [[ARG:%.*]], <4 x float> poison, <2 x i32>
 ; CHECK-NEXT: [[I1:%.*]] = bitcast <2 x float> [[I]] to <2 x i32>
 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[I1]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[I4:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP1]], <4 x i32>
+; CHECK-NEXT: [[I4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> , <4 x i32>
 ; CHECK-NEXT: ret <4 x i32> [[I4]]
 ;
   %i = shufflevector <4 x float> %arg, <4 x float> poison, <2 x i32>
diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll
index 0f233fbb4729e..588074779a4d6 100644
--- a/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll
@@ -1346,7 +1346,7 @@ define <2 x float> @frem_splat_constant1(<2 x float> %x) {
 define <2 x i1> @PR40734(<1 x i1> %x, <4 x i1> %y) {
 ; CHECK-LABEL: @PR40734(
-; CHECK-NEXT: [[WIDEN:%.*]] = shufflevector <1 x i1> zeroinitializer, <1 x i1> [[X:%.*]], <2 x i32>
+; CHECK-NEXT: [[WIDEN:%.*]] = shufflevector <1 x i1> [[X:%.*]], <1 x i1> zeroinitializer, <2 x i32>
 ; CHECK-NEXT: [[NARROW:%.*]] = shufflevector <4 x i1> [[Y:%.*]], <4 x i1> poison, <2 x i32>
 ; CHECK-NEXT: [[R:%.*]] = and <2 x i1> [[WIDEN]], [[NARROW]]
 ; CHECK-NEXT: ret <2 x i1> [[R]]
diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle.ll b/llvm/test/Transforms/InstCombine/vec_shuffle.ll
index 75a84e51279b8..5c5ecfa49e7ab 100644
--- a/llvm/test/Transforms/InstCombine/vec_shuffle.ll
+++ b/llvm/test/Transforms/InstCombine/vec_shuffle.ll
@@ -1351,7 +1351,7 @@ define <2 x float> @frem_splat_constant1(<2 x float> %x) {
 define <2 x i1> @PR40734(<1 x i1> %x, <4 x i1> %y) {
 ; CHECK-LABEL: @PR40734(
-; CHECK-NEXT: [[WIDEN:%.*]] = shufflevector <1 x i1> zeroinitializer, <1 x i1> [[X:%.*]], <2 x i32>
+; CHECK-NEXT: [[WIDEN:%.*]] = shufflevector <1 x i1> [[X:%.*]], <1 x i1> zeroinitializer, <2 x i32>
 ; CHECK-NEXT: [[NARROW:%.*]] = shufflevector <4 x i1> [[Y:%.*]], <4 x i1> poison, <2 x i32>
 ; CHECK-NEXT: [[R:%.*]] = and <2 x i1> [[WIDEN]], [[NARROW]]
 ; CHECK-NEXT: ret <2 x i1> [[R]]
@@ -2335,7 +2335,7 @@ define <2 x float> @uitofp_shuf_narrow(<4 x i32> %x, <4 x i32> %y) {
 define <4 x i16> @blend_elements_from_load(ptr align 8 %_0) {
 ; CHECK-LABEL: @blend_elements_from_load(
 ; CHECK-NEXT: [[LOAD:%.*]] = load <3 x i16>, ptr [[_0:%.*]], align 8
-; CHECK-NEXT: [[RV:%.*]] = shufflevector <3 x i16> , <3 x i16> [[LOAD]], <4 x i32>
+; CHECK-NEXT: [[RV:%.*]] = shufflevector <3 x i16> [[LOAD]], <3 x i16> , <4 x i32>
 ; CHECK-NEXT: ret <4 x i16> [[RV]]
 ;
   %load = load <3 x i16>, ptr %_0, align 8
@@ -2377,3 +2377,42 @@ define <2 x i32> @not_splat_shuffle2(i32 %x) {
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <2 x i32>
   ret <2 x i32> %shuf
 }
+
+define <2 x i32> @commutative0(<2 x i32> %x) {
+; CHECK-LABEL: @commutative0(
+; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <2 x i32>
+; CHECK-NEXT: ret <2 x i32> [[SHUF]]
+;
+  %shuf = shufflevector <2 x i32> poison, <2 x i32> %x, <2 x i32>
+  ret <2 x i32> %shuf
+}
+
+define <2 x i32> @commutative1(<2 x i32> %x) {
+; CHECK-LABEL: @commutative1(
+; CHECK-NEXT: [[SHUF1:%.*]] = insertelement <2 x i32> [[X:%.*]], i32 undef, i64 0
+; CHECK-NEXT: ret <2 x i32> [[SHUF1]]
+;
+  %shuf = shufflevector <2 x i32> undef, <2 x i32> %x, <2 x i32>
+  ret <2 x i32> %shuf
+}
+
+define <4 x i32> @commutative2(<4 x i32> %x) {
+; CHECK-LABEL: @commutative2(
+; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> , <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[SHUF]]
+;
+  %shuf = shufflevector <4 x i32> , <4 x i32> %x, <4 x i32>
+  ret <4 x i32> %shuf
+}
+
+define <2 x i32> @commutative3(<2 x i32> %x, <2 x i16> %y) {
+; CHECK-LABEL: @commutative3(
+; CHECK-NEXT: [[ZX:%.*]] = zext <2 x i16> [[Y:%.*]] to <2 x i32>
+; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> [[ZX]], <2 x i32>
+; CHECK-NEXT: ret <2 x i32> [[SHUF]]
+;
+
+  %zx = zext <2 x i16> %y to <2 x i32>
+  %shuf = shufflevector <2 x i32> %zx, <2 x i32> %x, <2 x i32>
+  ret <2 x i32> %shuf
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
index e24c52ba81ddf..3378cb40ed7b3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
@@ -78,7 +78,7 @@ define <8 x float> @fneg_fabs(<8 x float> %a) {
 ; CHECK-LABEL: @fneg_fabs(
 ; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
 ; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; CHECK-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32>
+; CHECK-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP1]], <8 x i32>
 ; CHECK-NEXT: ret <8 x float> [[DOTUNCASTED]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
index 0f8751a6da7f5..e6ab9432be013 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -78,7 +78,7 @@ define <8 x float> @fneg_fabs(<8 x float> %a) {
 ; CHECK-LABEL: @fneg_fabs(
 ; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
 ; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; CHECK-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32>
+; CHECK-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP1]], <8 x i32>
 ; CHECK-NEXT: ret <8 x float> [[DOTUNCASTED]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0