Skip to content

Commit 2d2c95a

Browse files
committed
[AArch64] Add tablegen patterns for fmla index with extract 0.
We have tablegen patterns to produce an indexed `fmla s0, s1, v2.s[2]` from `fma extract(Rn, lane), Rm, Ra -> fmla`. But for the case of lane==0, we want to prefer the simple `fmadd s0, s1, s2`, so we have patterns for `fma extract(Rn, 0), Rm, Ra -> fmadd`. The problem arises when we have two extracts, as tablegen starts to prefer the second pattern, as it looks more specialized. This patch adds additional patterns to catch this case: `fma extract(Rn, index), extract(Rm, 0), Ra -> fmla`. To make sure the simpler fmadd keeps being used when both operands are extracted from lane 0, we need to add patterns for that case too: `fma extract(Rn, 0), extract(Rm, 0), Ra -> fmadd`.
1 parent 1e9d068 commit 2d2c95a

File tree

4 files changed

+50
-24
lines changed

4 files changed

+50
-24
lines changed

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5821,6 +5821,13 @@ multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
58215821
(f16 FPR16:$Ra))),
58225822
(!cast<Instruction>(NAME # Hrrr)
58235823
(f16 (EXTRACT_SUBREG V128:$Rn, hsub)), FPR16:$Rm, FPR16:$Ra)>;
5824+
5825+
def : Pat<(f16 (node (f16 (extractelt (v8f16 V128:$Rn), (i64 0))),
5826+
(f16 (extractelt (v8f16 V128:$Rm), (i64 0))),
5827+
(f16 FPR16:$Ra))),
5828+
(!cast<Instruction>(NAME # Hrrr)
5829+
(f16 (EXTRACT_SUBREG V128:$Rn, hsub)),
5830+
(f16 (EXTRACT_SUBREG V128:$Rm, hsub)), FPR16:$Ra)>;
58245831
}
58255832

58265833
def : Pat<(f32 (node (f32 FPR32:$Rn),
@@ -5835,6 +5842,13 @@ multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
58355842
(!cast<Instruction>(NAME # Srrr)
58365843
(EXTRACT_SUBREG V128:$Rn, ssub), FPR32:$Rm, FPR32:$Ra)>;
58375844

5845+
def : Pat<(f32 (node (f32 (extractelt (v4f32 V128:$Rn), (i64 0))),
5846+
(f32 (extractelt (v4f32 V128:$Rm), (i64 0))),
5847+
(f32 FPR32:$Ra))),
5848+
(!cast<Instruction>(NAME # Srrr)
5849+
(EXTRACT_SUBREG V128:$Rn, ssub),
5850+
(EXTRACT_SUBREG V128:$Rm, ssub), FPR32:$Ra)>;
5851+
58385852
def : Pat<(f64 (node (f64 FPR64:$Rn),
58395853
(f64 (extractelt (v2f64 V128:$Rm), (i64 0))),
58405854
(f64 FPR64:$Ra))),
@@ -5846,6 +5860,13 @@ multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
58465860
(f64 FPR64:$Ra))),
58475861
(!cast<Instruction>(NAME # Drrr)
58485862
(EXTRACT_SUBREG V128:$Rn, dsub), FPR64:$Rm, FPR64:$Ra)>;
5863+
5864+
def : Pat<(f64 (node (f64 (extractelt (v2f64 V128:$Rn), (i64 0))),
5865+
(f64 (extractelt (v2f64 V128:$Rm), (i64 0))),
5866+
(f64 FPR64:$Ra))),
5867+
(!cast<Instruction>(NAME # Drrr)
5868+
(EXTRACT_SUBREG V128:$Rn, dsub),
5869+
(EXTRACT_SUBREG V128:$Rm, dsub), FPR64:$Ra)>;
58495870
}
58505871

58515872
//---
@@ -9282,6 +9303,11 @@ multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
92829303
(vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))),
92839304
(!cast<Instruction>(INST # "v1i16_indexed") FPR16:$Rd, FPR16:$Rn,
92849305
V128_lo:$Rm, VectorIndexH:$idx)>;
9306+
def : Pat<(f16 (OpNode (f16 FPR16:$Rd),
9307+
(vector_extract (v8f16 V128:$Rn), (i64 0)),
9308+
(vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))),
9309+
(!cast<Instruction>(INST # "v1i16_indexed") FPR16:$Rd,
9310+
(f16 (EXTRACT_SUBREG V128:$Rn, hsub)), V128_lo:$Rm, VectorIndexH:$idx)>;
92859311
} // Predicates = [HasNEON, HasFullFP16]
92869312

92879313
// 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
@@ -9323,12 +9349,22 @@ multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
93239349
(vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
93249350
(!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
93259351
V128:$Rm, VectorIndexS:$idx)>;
9352+
def : Pat<(f32 (OpNode (f32 FPR32:$Rd),
9353+
(vector_extract (v4f32 V128:$Rn), (i64 0)),
9354+
(vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
9355+
(!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd,
9356+
(f32 (EXTRACT_SUBREG V128:$Rn, ssub)), V128:$Rm, VectorIndexS:$idx)>;
93269357

93279358
// 1 variant for 64-bit scalar version: extract from .1d or from .2d
93289359
def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
93299360
(vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))),
93309361
(!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn,
93319362
V128:$Rm, VectorIndexD:$idx)>;
9363+
def : Pat<(f64 (OpNode (f64 FPR64:$Rd),
9364+
(vector_extract (v2f64 V128:$Rn), (i64 0)),
9365+
(vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))),
9366+
(!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd,
9367+
(f64 (EXTRACT_SUBREG V128:$Rn, dsub)), V128:$Rm, VectorIndexD:$idx)>;
93329368
}
93339369

93349370
let mayRaiseFPException = 1, Uses = [FPCR] in

llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@ define <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) {
1111
; CHECK-NEXT: mov h2, v0.h[1]
1212
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1313
; CHECK-NEXT: fmul h3, h0, v1.h[1]
14-
; CHECK-NEXT: fmul h4, h2, v1.h[1]
15-
; CHECK-NEXT: fmadd h2, h1, h2, h3
16-
; CHECK-NEXT: fnmsub h0, h1, h0, h4
17-
; CHECK-NEXT: mov v0.h[1], v2.h[0]
14+
; CHECK-NEXT: fmul h2, h2, v1.h[1]
15+
; CHECK-NEXT: fmla h3, h1, v0.h[1]
16+
; CHECK-NEXT: fnmsub h0, h1, h0, h2
17+
; CHECK-NEXT: mov v0.h[1], v3.h[0]
1818
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
1919
; CHECK-NEXT: ret
2020
entry:

llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,7 @@ define half @t_vfmah_lane_f16_3_0(half %a, <4 x half> %c) {
120120
; CHECK-LABEL: t_vfmah_lane_f16_3_0:
121121
; CHECK: // %bb.0: // %entry
122122
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
123-
; CHECK-NEXT: mov h2, v1.h[3]
124-
; CHECK-NEXT: fmadd h0, h1, h2, h0
123+
; CHECK-NEXT: fmla h0, h1, v1.h[3]
125124
; CHECK-NEXT: ret
126125
entry:
127126
%b = extractelement <4 x half> %c, i32 0
@@ -310,8 +309,7 @@ define half @t_vfmsh_lane_f16_0_3(half %a, <4 x half> %c, i32 %lane) {
310309
; CHECK-LABEL: t_vfmsh_lane_f16_0_3:
311310
; CHECK: // %bb.0: // %entry
312311
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
313-
; CHECK-NEXT: mov h2, v1.h[3]
314-
; CHECK-NEXT: fmsub h0, h2, h1, h0
312+
; CHECK-NEXT: fmls h0, h1, v1.h[3]
315313
; CHECK-NEXT: ret
316314
entry:
317315
%b = extractelement <4 x half> %c, i32 0

llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,7 @@ define float @test_fmla_ss2S_1(float %a, float %b, <2 x float> %v) {
8484
define float @test_fmla_ss4S_3_ext0(float %a, <4 x float> %v) {
8585
; CHECK-LABEL: test_fmla_ss4S_3_ext0:
8686
; CHECK: // %bb.0:
87-
; CHECK-NEXT: mov s2, v1.s[3]
88-
; CHECK-NEXT: fmadd s0, s1, s2, s0
87+
; CHECK-NEXT: fmla s0, s1, v1.s[3]
8988
; CHECK-NEXT: ret
9089
%tmp0 = extractelement <4 x float> %v, i32 0
9190
%tmp1 = extractelement <4 x float> %v, i32 3
@@ -96,8 +95,7 @@ define float @test_fmla_ss4S_3_ext0(float %a, <4 x float> %v) {
9695
define float @test_fmla_ss4S_3_ext0_swp(float %a, <4 x float> %v) {
9796
; CHECK-LABEL: test_fmla_ss4S_3_ext0_swp:
9897
; CHECK: // %bb.0:
99-
; CHECK-NEXT: mov s2, v1.s[3]
100-
; CHECK-NEXT: fmadd s0, s2, s1, s0
98+
; CHECK-NEXT: fmla s0, s1, v1.s[3]
10199
; CHECK-NEXT: ret
102100
%tmp0 = extractelement <4 x float> %v, i32 0
103101
%tmp1 = extractelement <4 x float> %v, i32 3
@@ -120,8 +118,7 @@ define float @test_fmla_ss2S_3_ext0(float %a, <2 x float> %v) {
120118
; CHECK-LABEL: test_fmla_ss2S_3_ext0:
121119
; CHECK: // %bb.0:
122120
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
123-
; CHECK-NEXT: mov s2, v1.s[1]
124-
; CHECK-NEXT: fmadd s0, s1, s2, s0
121+
; CHECK-NEXT: fmla s0, s1, v1.s[1]
125122
; CHECK-NEXT: ret
126123
%tmp0 = extractelement <2 x float> %v, i32 0
127124
%tmp1 = extractelement <2 x float> %v, i32 1
@@ -133,8 +130,7 @@ define float @test_fmla_ss2S_3_ext0_swp(float %a, <2 x float> %v) {
133130
; CHECK-LABEL: test_fmla_ss2S_3_ext0_swp:
134131
; CHECK: // %bb.0:
135132
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
136-
; CHECK-NEXT: mov s2, v1.s[1]
137-
; CHECK-NEXT: fmadd s0, s2, s1, s0
133+
; CHECK-NEXT: fmla s0, s1, v1.s[1]
138134
; CHECK-NEXT: ret
139135
%tmp0 = extractelement <2 x float> %v, i32 0
140136
%tmp1 = extractelement <2 x float> %v, i32 1
@@ -218,8 +214,7 @@ define double @test_fmla_dd2D_1_swap(double %a, double %b, <2 x double> %v) {
218214
define double @test_fmla_ss2D_1_ext0(double %a, <2 x double> %v) {
219215
; CHECK-LABEL: test_fmla_ss2D_1_ext0:
220216
; CHECK: // %bb.0:
221-
; CHECK-NEXT: mov d2, v1.d[1]
222-
; CHECK-NEXT: fmadd d0, d1, d2, d0
217+
; CHECK-NEXT: fmla d0, d1, v1.d[1]
223218
; CHECK-NEXT: ret
224219
%tmp0 = extractelement <2 x double> %v, i32 0
225220
%tmp1 = extractelement <2 x double> %v, i32 1
@@ -230,8 +225,7 @@ define double @test_fmla_ss2D_1_ext0(double %a, <2 x double> %v) {
230225
define double @test_fmla_ss2D_1_ext0_swp(double %a, <2 x double> %v) {
231226
; CHECK-LABEL: test_fmla_ss2D_1_ext0_swp:
232227
; CHECK: // %bb.0:
233-
; CHECK-NEXT: mov d2, v1.d[1]
234-
; CHECK-NEXT: fmadd d0, d2, d1, d0
228+
; CHECK-NEXT: fmla d0, d1, v1.d[1]
235229
; CHECK-NEXT: ret
236230
%tmp0 = extractelement <2 x double> %v, i32 0
237231
%tmp1 = extractelement <2 x double> %v, i32 1
@@ -340,8 +334,7 @@ define float @test_fmls_ss2S_1(float %a, float %b, <2 x float> %v) {
340334
define float @test_fmls_ss4S_3_ext0(float %a, <4 x float> %v) {
341335
; CHECK-LABEL: test_fmls_ss4S_3_ext0:
342336
; CHECK: // %bb.0:
343-
; CHECK-NEXT: mov s2, v1.s[3]
344-
; CHECK-NEXT: fmsub s0, s1, s2, s0
337+
; CHECK-NEXT: fmls s0, s1, v1.s[3]
345338
; CHECK-NEXT: ret
346339
%tmp0 = extractelement <4 x float> %v, i32 0
347340
%tmp1 = extractelement <4 x float> %v, i32 3
@@ -437,8 +430,7 @@ define double @test_fmls_dd2D_1_swap(double %a, double %b, <2 x double> %v) {
437430
define double @test_fmls_dd2D_1_ext0(double %a, <2 x double> %v) {
438431
; CHECK-LABEL: test_fmls_dd2D_1_ext0:
439432
; CHECK: // %bb.0:
440-
; CHECK-NEXT: mov d2, v1.d[1]
441-
; CHECK-NEXT: fmsub d0, d1, d2, d0
433+
; CHECK-NEXT: fmls d0, d1, v1.d[1]
442434
; CHECK-NEXT: ret
443435
%tmp0 = extractelement <2 x double> %v, i32 0
444436
%tmp1 = extractelement <2 x double> %v, i32 1

0 commit comments

Comments
 (0)