Skip to content

Commit 09b828a

Browse files
committed
[X86] combineConcatVectorOps - add handling to concat sqrt intrinsics together
Similar to fdiv, we should try to concatenate these high-latency instructions together.
1 parent 130746a commit 09b828a

File tree

2 files changed

+17
-12
lines changed

2 files changed

+17
-12
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -59458,6 +59458,12 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5945859458
ConcatSubOperand(VT, Ops, 1));
5945959459
}
5946059460
break;
59461+
case ISD::FSQRT:
59462+
if (!IsSplat && (VT.is256BitVector() ||
59463+
(VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59464+
return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
59465+
}
59466+
break;
5946159467
case X86ISD::HADD:
5946259468
case X86ISD::HSUB:
5946359469
case X86ISD::FHADD:

llvm/test/CodeGen/X86/combine-fsqrt.ll

Lines changed: 11 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -14,9 +14,9 @@ define <8 x float> @concat_sqrt_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
1414
;
1515
; AVX-LABEL: concat_sqrt_v8f32_v4f32:
1616
; AVX: # %bb.0:
17-
; AVX-NEXT: vsqrtps %xmm0, %xmm0
18-
; AVX-NEXT: vsqrtps %xmm1, %xmm1
17+
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1918
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
19+
; AVX-NEXT: vsqrtps %ymm0, %ymm0
2020
; AVX-NEXT: retq
2121
%v0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
2222
%v1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a1)
@@ -35,23 +35,22 @@ define <16 x float> @concat_sqrt_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1,
3535
;
3636
; AVX1OR2-LABEL: concat_sqrt_v16f32_v4f32:
3737
; AVX1OR2: # %bb.0:
38-
; AVX1OR2-NEXT: vsqrtps %xmm0, %xmm0
39-
; AVX1OR2-NEXT: vsqrtps %xmm1, %xmm1
40-
; AVX1OR2-NEXT: vsqrtps %xmm2, %xmm2
41-
; AVX1OR2-NEXT: vsqrtps %xmm3, %xmm3
38+
; AVX1OR2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
39+
; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
4240
; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
41+
; AVX1OR2-NEXT: vsqrtps %ymm0, %ymm0
4342
; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
43+
; AVX1OR2-NEXT: vsqrtps %ymm1, %ymm1
4444
; AVX1OR2-NEXT: retq
4545
;
4646
; AVX512-LABEL: concat_sqrt_v16f32_v4f32:
4747
; AVX512: # %bb.0:
48-
; AVX512-NEXT: vsqrtps %xmm0, %xmm0
49-
; AVX512-NEXT: vsqrtps %xmm1, %xmm1
50-
; AVX512-NEXT: vsqrtps %xmm2, %xmm2
51-
; AVX512-NEXT: vsqrtps %xmm3, %xmm3
48+
; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
49+
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
5250
; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
5351
; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
5452
; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
53+
; AVX512-NEXT: vsqrtps %zmm0, %zmm0
5554
; AVX512-NEXT: retq
5655
%v0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
5756
%v1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a1)
@@ -80,9 +79,9 @@ define <16 x float> @concat_sqrt_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1)
8079
;
8180
; AVX512-LABEL: concat_sqrt_v16f32_v8f32:
8281
; AVX512: # %bb.0:
83-
; AVX512-NEXT: vsqrtps %ymm0, %ymm0
84-
; AVX512-NEXT: vsqrtps %ymm1, %ymm1
82+
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
8583
; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
84+
; AVX512-NEXT: vsqrtps %zmm0, %zmm0
8685
; AVX512-NEXT: retq
8786
%v0 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0)
8887
%v1 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a1)

0 commit comments

Comments
 (0)