Skip to content

Commit e37dfa0

Browse files
RKSimon authored and augusto2112 committed
[X86] combineConcatVectorOps - add handling to concat fp rounding intrinsics together (llvm#170160)
1 parent fe14583 commit e37dfa0

File tree

7 files changed

+340
-216
lines changed

7 files changed

+340
-216
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -59459,6 +59459,11 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5945959459
}
5946059460
break;
5946159461
case ISD::FSQRT:
59462+
case ISD::FCEIL:
59463+
case ISD::FTRUNC:
59464+
case ISD::FRINT:
59465+
case ISD::FNEARBYINT:
59466+
case ISD::FROUNDEVEN:
5946259467
if (!IsSplat && (VT.is256BitVector() ||
5946359468
(VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
5946459469
return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
@@ -59470,6 +59475,17 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5947059475
return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
5947159476
}
5947259477
break;
59478+
case X86ISD::VRNDSCALE:
59479+
if (!IsSplat &&
59480+
(VT.is256BitVector() ||
59481+
(VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59482+
llvm::all_of(Ops, [Op0](SDValue Op) {
59483+
return Op0.getOperand(1) == Op.getOperand(1);
59484+
})) {
59485+
return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59486+
Op0.getOperand(1));
59487+
}
59488+
break;
5947359489
case X86ISD::HADD:
5947459490
case X86ISD::HSUB:
5947559491
case X86ISD::FHADD:

llvm/test/CodeGen/X86/combine-fceil.ll

Lines changed: 54 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
3-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2
4-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2
3+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
4+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
55
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
66

77
define <4 x double> @concat_ceil_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) {
@@ -13,9 +13,9 @@ define <4 x double> @concat_ceil_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1)
1313
;
1414
; AVX-LABEL: concat_ceil_v4f64_v2f64:
1515
; AVX: # %bb.0:
16-
; AVX-NEXT: vroundpd $10, %xmm0, %xmm0
17-
; AVX-NEXT: vroundpd $10, %xmm1, %xmm1
16+
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1817
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
18+
; AVX-NEXT: vroundpd $10, %ymm0, %ymm0
1919
; AVX-NEXT: retq
2020
%v0 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a0)
2121
%v1 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a1)
@@ -32,9 +32,9 @@ define <8 x float> @concat_ceil_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
3232
;
3333
; AVX-LABEL: concat_ceil_v8f32_v4f32:
3434
; AVX: # %bb.0:
35-
; AVX-NEXT: vroundps $10, %xmm0, %xmm0
36-
; AVX-NEXT: vroundps $10, %xmm1, %xmm1
35+
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
3736
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
37+
; AVX-NEXT: vroundps $10, %ymm0, %ymm0
3838
; AVX-NEXT: retq
3939
%v0 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a0)
4040
%v1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a1)
@@ -51,25 +51,34 @@ define <8 x double> @concat_ceil_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1,
5151
; SSE-NEXT: roundpd $10, %xmm3, %xmm3
5252
; SSE-NEXT: retq
5353
;
54-
; AVX1OR2-LABEL: concat_ceil_v8f64_v2f64:
55-
; AVX1OR2: # %bb.0:
56-
; AVX1OR2-NEXT: vroundpd $10, %xmm0, %xmm0
57-
; AVX1OR2-NEXT: vroundpd $10, %xmm1, %xmm1
58-
; AVX1OR2-NEXT: vroundpd $10, %xmm2, %xmm2
59-
; AVX1OR2-NEXT: vroundpd $10, %xmm3, %xmm3
60-
; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
61-
; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
62-
; AVX1OR2-NEXT: retq
54+
; AVX1-LABEL: concat_ceil_v8f64_v2f64:
55+
; AVX1: # %bb.0:
56+
; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
57+
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
58+
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
59+
; AVX1-NEXT: vroundpd $10, %ymm0, %ymm0
60+
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
61+
; AVX1-NEXT: vroundpd $10, %ymm1, %ymm1
62+
; AVX1-NEXT: retq
63+
;
64+
; AVX2-LABEL: concat_ceil_v8f64_v2f64:
65+
; AVX2: # %bb.0:
66+
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
67+
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
68+
; AVX2-NEXT: vroundpd $10, %ymm0, %ymm0
69+
; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
70+
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
71+
; AVX2-NEXT: vroundpd $10, %ymm1, %ymm1
72+
; AVX2-NEXT: retq
6373
;
6474
; AVX512-LABEL: concat_ceil_v8f64_v2f64:
6575
; AVX512: # %bb.0:
66-
; AVX512-NEXT: vroundpd $10, %xmm0, %xmm0
67-
; AVX512-NEXT: vroundpd $10, %xmm1, %xmm1
68-
; AVX512-NEXT: vroundpd $10, %xmm2, %xmm2
69-
; AVX512-NEXT: vroundpd $10, %xmm3, %xmm3
76+
; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
77+
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
7078
; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
7179
; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
7280
; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
81+
; AVX512-NEXT: vrndscalepd $10, %zmm0, %zmm0
7382
; AVX512-NEXT: retq
7483
%v0 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a0)
7584
%v1 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a1)
@@ -90,25 +99,34 @@ define <16 x float> @concat_ceil_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1,
9099
; SSE-NEXT: roundps $10, %xmm3, %xmm3
91100
; SSE-NEXT: retq
92101
;
93-
; AVX1OR2-LABEL: concat_ceil_v16f32_v4f32:
94-
; AVX1OR2: # %bb.0:
95-
; AVX1OR2-NEXT: vroundps $10, %xmm0, %xmm0
96-
; AVX1OR2-NEXT: vroundps $10, %xmm1, %xmm1
97-
; AVX1OR2-NEXT: vroundps $10, %xmm2, %xmm2
98-
; AVX1OR2-NEXT: vroundps $10, %xmm3, %xmm3
99-
; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
100-
; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
101-
; AVX1OR2-NEXT: retq
102+
; AVX1-LABEL: concat_ceil_v16f32_v4f32:
103+
; AVX1: # %bb.0:
104+
; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
105+
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
106+
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
107+
; AVX1-NEXT: vroundps $10, %ymm0, %ymm0
108+
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
109+
; AVX1-NEXT: vroundps $10, %ymm1, %ymm1
110+
; AVX1-NEXT: retq
111+
;
112+
; AVX2-LABEL: concat_ceil_v16f32_v4f32:
113+
; AVX2: # %bb.0:
114+
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
115+
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
116+
; AVX2-NEXT: vroundps $10, %ymm0, %ymm0
117+
; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
118+
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
119+
; AVX2-NEXT: vroundps $10, %ymm1, %ymm1
120+
; AVX2-NEXT: retq
102121
;
103122
; AVX512-LABEL: concat_ceil_v16f32_v4f32:
104123
; AVX512: # %bb.0:
105-
; AVX512-NEXT: vroundps $10, %xmm0, %xmm0
106-
; AVX512-NEXT: vroundps $10, %xmm1, %xmm1
107-
; AVX512-NEXT: vroundps $10, %xmm2, %xmm2
108-
; AVX512-NEXT: vroundps $10, %xmm3, %xmm3
124+
; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
125+
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
109126
; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
110127
; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
111128
; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
129+
; AVX512-NEXT: vrndscaleps $10, %zmm0, %zmm0
112130
; AVX512-NEXT: retq
113131
%v0 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a0)
114132
%v1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a1)
@@ -137,9 +155,9 @@ define <8 x double> @concat_ceil_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1)
137155
;
138156
; AVX512-LABEL: concat_ceil_v8f64_v4f64:
139157
; AVX512: # %bb.0:
140-
; AVX512-NEXT: vroundpd $10, %ymm0, %ymm0
141-
; AVX512-NEXT: vroundpd $10, %ymm1, %ymm1
158+
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
142159
; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
160+
; AVX512-NEXT: vrndscalepd $10, %zmm0, %zmm0
143161
; AVX512-NEXT: retq
144162
%v0 = call <4 x double> @llvm.ceil.v4f64(<4 x double> %a0)
145163
%v1 = call <4 x double> @llvm.ceil.v4f64(<4 x double> %a1)
@@ -164,9 +182,9 @@ define <16 x float> @concat_ceil_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1)
164182
;
165183
; AVX512-LABEL: concat_ceil_v16f32_v8f32:
166184
; AVX512: # %bb.0:
167-
; AVX512-NEXT: vroundps $10, %ymm0, %ymm0
168-
; AVX512-NEXT: vroundps $10, %ymm1, %ymm1
185+
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
169186
; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
187+
; AVX512-NEXT: vrndscaleps $10, %zmm0, %zmm0
170188
; AVX512-NEXT: retq
171189
%v0 = call <8 x float> @llvm.ceil.v8f32(<8 x float> %a0)
172190
%v1 = call <8 x float> @llvm.ceil.v8f32(<8 x float> %a1)

llvm/test/CodeGen/X86/combine-fnearbyint.ll

Lines changed: 54 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
3-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2
4-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2
3+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
4+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
55
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
66

77
define <4 x double> @concat_nearbyint_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) {
@@ -13,9 +13,9 @@ define <4 x double> @concat_nearbyint_v4f64_v2f64(<2 x double> %a0, <2 x double>
1313
;
1414
; AVX-LABEL: concat_nearbyint_v4f64_v2f64:
1515
; AVX: # %bb.0:
16-
; AVX-NEXT: vroundpd $12, %xmm0, %xmm0
17-
; AVX-NEXT: vroundpd $12, %xmm1, %xmm1
16+
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1817
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
18+
; AVX-NEXT: vroundpd $12, %ymm0, %ymm0
1919
; AVX-NEXT: retq
2020
%v0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a0)
2121
%v1 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a1)
@@ -32,9 +32,9 @@ define <8 x float> @concat_nearbyint_v8f32_v4f32(<4 x float> %a0, <4 x float> %a
3232
;
3333
; AVX-LABEL: concat_nearbyint_v8f32_v4f32:
3434
; AVX: # %bb.0:
35-
; AVX-NEXT: vroundps $12, %xmm0, %xmm0
36-
; AVX-NEXT: vroundps $12, %xmm1, %xmm1
35+
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
3736
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
37+
; AVX-NEXT: vroundps $12, %ymm0, %ymm0
3838
; AVX-NEXT: retq
3939
%v0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a0)
4040
%v1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a1)
@@ -51,25 +51,34 @@ define <8 x double> @concat_nearbyint_v8f64_v2f64(<2 x double> %a0, <2 x double>
5151
; SSE-NEXT: roundpd $12, %xmm3, %xmm3
5252
; SSE-NEXT: retq
5353
;
54-
; AVX1OR2-LABEL: concat_nearbyint_v8f64_v2f64:
55-
; AVX1OR2: # %bb.0:
56-
; AVX1OR2-NEXT: vroundpd $12, %xmm0, %xmm0
57-
; AVX1OR2-NEXT: vroundpd $12, %xmm1, %xmm1
58-
; AVX1OR2-NEXT: vroundpd $12, %xmm2, %xmm2
59-
; AVX1OR2-NEXT: vroundpd $12, %xmm3, %xmm3
60-
; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
61-
; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
62-
; AVX1OR2-NEXT: retq
54+
; AVX1-LABEL: concat_nearbyint_v8f64_v2f64:
55+
; AVX1: # %bb.0:
56+
; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
57+
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
58+
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
59+
; AVX1-NEXT: vroundpd $12, %ymm0, %ymm0
60+
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
61+
; AVX1-NEXT: vroundpd $12, %ymm1, %ymm1
62+
; AVX1-NEXT: retq
63+
;
64+
; AVX2-LABEL: concat_nearbyint_v8f64_v2f64:
65+
; AVX2: # %bb.0:
66+
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
67+
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
68+
; AVX2-NEXT: vroundpd $12, %ymm0, %ymm0
69+
; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
70+
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
71+
; AVX2-NEXT: vroundpd $12, %ymm1, %ymm1
72+
; AVX2-NEXT: retq
6373
;
6474
; AVX512-LABEL: concat_nearbyint_v8f64_v2f64:
6575
; AVX512: # %bb.0:
66-
; AVX512-NEXT: vroundpd $12, %xmm0, %xmm0
67-
; AVX512-NEXT: vroundpd $12, %xmm1, %xmm1
68-
; AVX512-NEXT: vroundpd $12, %xmm2, %xmm2
69-
; AVX512-NEXT: vroundpd $12, %xmm3, %xmm3
76+
; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
77+
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
7078
; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
7179
; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
7280
; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
81+
; AVX512-NEXT: vrndscalepd $12, %zmm0, %zmm0
7382
; AVX512-NEXT: retq
7483
%v0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a0)
7584
%v1 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a1)
@@ -90,25 +99,34 @@ define <16 x float> @concat_nearbyint_v16f32_v4f32(<4 x float> %a0, <4 x float>
9099
; SSE-NEXT: roundps $12, %xmm3, %xmm3
91100
; SSE-NEXT: retq
92101
;
93-
; AVX1OR2-LABEL: concat_nearbyint_v16f32_v4f32:
94-
; AVX1OR2: # %bb.0:
95-
; AVX1OR2-NEXT: vroundps $12, %xmm0, %xmm0
96-
; AVX1OR2-NEXT: vroundps $12, %xmm1, %xmm1
97-
; AVX1OR2-NEXT: vroundps $12, %xmm2, %xmm2
98-
; AVX1OR2-NEXT: vroundps $12, %xmm3, %xmm3
99-
; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
100-
; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
101-
; AVX1OR2-NEXT: retq
102+
; AVX1-LABEL: concat_nearbyint_v16f32_v4f32:
103+
; AVX1: # %bb.0:
104+
; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
105+
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
106+
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
107+
; AVX1-NEXT: vroundps $12, %ymm0, %ymm0
108+
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
109+
; AVX1-NEXT: vroundps $12, %ymm1, %ymm1
110+
; AVX1-NEXT: retq
111+
;
112+
; AVX2-LABEL: concat_nearbyint_v16f32_v4f32:
113+
; AVX2: # %bb.0:
114+
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
115+
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
116+
; AVX2-NEXT: vroundps $12, %ymm0, %ymm0
117+
; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
118+
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
119+
; AVX2-NEXT: vroundps $12, %ymm1, %ymm1
120+
; AVX2-NEXT: retq
102121
;
103122
; AVX512-LABEL: concat_nearbyint_v16f32_v4f32:
104123
; AVX512: # %bb.0:
105-
; AVX512-NEXT: vroundps $12, %xmm0, %xmm0
106-
; AVX512-NEXT: vroundps $12, %xmm1, %xmm1
107-
; AVX512-NEXT: vroundps $12, %xmm2, %xmm2
108-
; AVX512-NEXT: vroundps $12, %xmm3, %xmm3
124+
; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
125+
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
109126
; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
110127
; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
111128
; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
129+
; AVX512-NEXT: vrndscaleps $12, %zmm0, %zmm0
112130
; AVX512-NEXT: retq
113131
%v0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a0)
114132
%v1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a1)
@@ -137,9 +155,9 @@ define <8 x double> @concat_nearbyint_v8f64_v4f64(<4 x double> %a0, <4 x double>
137155
;
138156
; AVX512-LABEL: concat_nearbyint_v8f64_v4f64:
139157
; AVX512: # %bb.0:
140-
; AVX512-NEXT: vroundpd $12, %ymm0, %ymm0
141-
; AVX512-NEXT: vroundpd $12, %ymm1, %ymm1
158+
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
142159
; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
160+
; AVX512-NEXT: vrndscalepd $12, %zmm0, %zmm0
143161
; AVX512-NEXT: retq
144162
%v0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %a0)
145163
%v1 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %a1)
@@ -164,9 +182,9 @@ define <16 x float> @concat_nearbyint_v16f32_v8f32(<8 x float> %a0, <8 x float>
164182
;
165183
; AVX512-LABEL: concat_nearbyint_v16f32_v8f32:
166184
; AVX512: # %bb.0:
167-
; AVX512-NEXT: vroundps $12, %ymm0, %ymm0
168-
; AVX512-NEXT: vroundps $12, %ymm1, %ymm1
185+
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
169186
; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
187+
; AVX512-NEXT: vrndscaleps $12, %zmm0, %zmm0
170188
; AVX512-NEXT: retq
171189
%v0 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %a0)
172190
%v1 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %a1)

0 commit comments

Comments
 (0)