Skip to content

Commit a29d7a1

Browse files
authored
[GlobalISel] fdiv to fmul transform (llvm#144305)
This is a port of the SDAG DAGCombiner::combineRepeatedFPDivisors combine that looks for multiple fdiv operations with the same divisor and converts them to a single reciprocal fdiv and multiple fmuls. It is currently a fairly faithful port, with some additions to make sure that the newly created fdiv dominates all new uses. Compared to the SDAG version it also drops some logic about splat uses which assumes no vector fdivs and some logic about x/sqrt(x) which does not yet apply to GISel.
1 parent a1f9ad2 commit a29d7a1

File tree

4 files changed

+153
-82
lines changed

4 files changed

+153
-82
lines changed

llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -848,6 +848,10 @@ class CombinerHelper {
848848

849849
bool matchCombineFMinMaxNaN(MachineInstr &MI, unsigned &Info) const;
850850

851+
/// Match a G_FDIV root \p MI whose divisor is shared with other G_FDIVs so
/// that they can all be rewritten as multiplications by a single reciprocal.
/// The instructions to rewrite are collected in \p MatchInfo.
bool matchRepeatedFPDivisor(MachineInstr &MI,
852+
SmallVector<MachineInstr *> &MatchInfo) const;
853+
/// Replace every G_FDIV listed in \p MatchInfo with a G_FMUL by a newly built
/// reciprocal (1.0 / divisor) of the shared divisor.
void applyRepeatedFPDivisor(SmallVector<MachineInstr *> &MatchInfo) const;
854+
851855
/// Transform G_ADD(x, G_SUB(y, x)) to y.
852856
/// Transform G_ADD(G_SUB(y, x), x) to y.
853857
bool matchAddSubSameReg(MachineInstr &MI, Register &Src) const;

llvm/include/llvm/Target/GlobalISel/Combine.td

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@ def build_fn_matchinfo :
213213
GIDefMatchData<"std::function<void(MachineIRBuilder &)>">;
214214
def unsigned_matchinfo: GIDefMatchData<"unsigned">;
215215
def register_vector_matchinfo : GIDefMatchData<"SmallVector<Register>">;
216+
// Match data holding a list of MachineInstr pointers.
def mi_vector_matchinfo : GIDefMatchData<"SmallVector<MachineInstr *>">;
216217

217218
def copy_prop : GICombineRule<
218219
(defs root:$d),
@@ -1416,6 +1417,14 @@ def combine_minmax_nan: GICombineRule<
14161417
[{ return Helper.matchCombineFMinMaxNaN(*${root}, ${info}); }]),
14171418
(apply [{ Helper.replaceSingleDefInstWithOperand(*${root}, ${info}); }])>;
14181419

1420+
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
1421+
// reciprocal.
// E.g., (a / Y; b / Y) -> (recip = 1.0 / Y; a * recip; b * recip)
// The match step fills $matchinfo with the G_FDIV instructions to rewrite and
// the apply step consumes that list.
1422+
def fdiv_repeated_divison: GICombineRule<
1423+
(defs root:$root, mi_vector_matchinfo:$matchinfo),
1424+
(match (G_FDIV $dst, $src1, $src2):$root,
1425+
[{ return Helper.matchRepeatedFPDivisor(*${root}, ${matchinfo}); }]),
1426+
(apply [{ Helper.applyRepeatedFPDivisor(${matchinfo}); }])>;
1427+
14191428
// Transform (add x, (sub y, x)) -> y
14201429
// Transform (add (sub y, x), x) -> y
14211430
def add_sub_reg_frags : GICombinePatFrag<
@@ -2139,7 +2148,8 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
21392148
constant_fold_cast_op, fabs_fneg_fold,
21402149
mulh_combines, redundant_neg_operands,
21412150
and_or_disjoint_mask, fma_combines, fold_binop_into_select,
2142-
intrem_combines, intdiv_combines, sub_add_reg, select_to_minmax,
2151+
intrem_combines, intdiv_combines, fdiv_repeated_divison,
2152+
sub_add_reg, select_to_minmax,
21432153
fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
21442154
simplify_neg_minmax, combine_concat_vector,
21452155
sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6745,6 +6745,73 @@ bool CombinerHelper::matchCombineFMinMaxNaN(MachineInstr &MI,
67456745
return MatchNaN(1) || MatchNaN(2);
67466746
}
67476747

6748+
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
6749+
// reciprocal.
6750+
// E.g., (a / Y; b / Y;) -> (recip = 1.0 / Y; a * recip; b * recip)
//
// \p MI is the root G_FDIV (X / Y). On a successful match \p MatchInfo holds
// the root plus every other eligible G_FDIV by Y found in the same basic
// block, with the instruction that dominates the others stored at index 0.
// Returns true only when the number of collected divisions reaches the
// minimum reported by the target.
// NOTE(review): MatchInfo is appended to before the final threshold check, so
// it is left populated when that check fails; this presumably relies on the
// caller discarding match data after a failed match -- confirm.
6751+
bool CombinerHelper::matchRepeatedFPDivisor(
6752+
MachineInstr &MI, SmallVector<MachineInstr *> &MatchInfo) const {
6753+
assert(MI.getOpcode() == TargetOpcode::G_FDIV);
6754+
6755+
// X is the dividend and Y the (potentially shared) divisor of the root.
Register X = MI.getOperand(1).getReg();
6756+
Register Y = MI.getOperand(2).getReg();
6757+
6758+
// The root itself must allow reciprocal formation (arcp).
if (!MI.getFlag(MachineInstr::MIFlag::FmArcp))
6759+
return false;
6760+
6761+
// Skip if current node is a reciprocal/fneg-reciprocal.
6762+
auto N0CFP = isConstantOrConstantSplatVectorFP(*MRI.getVRegDef(X), MRI);
6763+
if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0)))
6764+
return false;
6765+
6766+
// Exit early if the target does not want this transform or if there can't
6767+
// possibly be enough uses of the divisor to make the transform worthwhile.
6768+
unsigned MinUses = getTargetLowering().combineRepeatedFPDivisors();
6769+
if (!MinUses)
6770+
return false;
6771+
6772+
// Find all FDIV users of the same divisor. For the moment we limit all
6773+
// instructions to a single BB and use the first Instr in MatchInfo as the
6774+
// dominating position.
6775+
MatchInfo.push_back(&MI);
6776+
for (auto &U : MRI.use_nodbg_instructions(Y)) {
6777+
// Ignore the root (already recorded) and users in other basic blocks.
if (&U == &MI || U.getParent() != MI.getParent())
6778+
continue;
6779+
// Only G_FDIVs that divide by Y qualify; a user whose dividend is also Y
// (Y / Y) is not collected.
if (U.getOpcode() == TargetOpcode::G_FDIV &&
6780+
U.getOperand(2).getReg() == Y && U.getOperand(1).getReg() != Y) {
6781+
// This division is eligible for optimization only if it carries the arcp
6782+
// flag, i.e. if this division allows reciprocal formation.
6783+
if (U.getFlag(MachineInstr::MIFlag::FmArcp)) {
6784+
MatchInfo.push_back(&U);
6785+
// Keep the dominating instruction at index 0 so the reciprocal can later
// be emitted at its position.
if (dominates(U, *MatchInfo[0]))
6786+
std::swap(MatchInfo[0], MatchInfo.back());
6787+
}
6788+
}
6789+
}
6790+
6791+
// Now that we have the actual number of divisor uses, make sure it meets
6792+
// the minimum threshold specified by the target.
6793+
return MatchInfo.size() >= MinUses;
6794+
}
6795+
6796+
// Rewrite every G_FDIV in \p MatchInfo as a G_FMUL by one shared reciprocal.
// The reciprocal (1.0 / divisor) is built in front of MatchInfo[0]; each
// listed division is then replaced in place by dividend * reciprocal and
// erased. MatchInfo[0] is indexed unconditionally, so the list must not be
// empty.
void CombinerHelper::applyRepeatedFPDivisor(
6797+
SmallVector<MachineInstr *> &MatchInfo) const {
6798+
// Generate the new div at the position of the first instruction, that we have
6799+
// ensured will dominate all other instructions.
6800+
Builder.setInsertPt(*MatchInfo[0]->getParent(), MatchInfo[0]);
6801+
// The reciprocal uses the result type, divisor and flags of MatchInfo[0].
LLT Ty = MRI.getType(MatchInfo[0]->getOperand(0).getReg());
6802+
auto Div = Builder.buildFDiv(Ty, Builder.buildFConstant(Ty, 1.0),
6803+
MatchInfo[0]->getOperand(2).getReg(),
6804+
MatchInfo[0]->getFlags());
6805+
6806+
// Replace all found div's with fmul instructions.
6807+
for (MachineInstr *MI : MatchInfo) {
6808+
// Emit the multiply right before the division it replaces, reusing the
// division's destination register and flags, then drop the division.
Builder.setInsertPt(*MI->getParent(), MI);
6809+
Builder.buildFMul(MI->getOperand(0).getReg(), MI->getOperand(1).getReg(),
6810+
Div->getOperand(0).getReg(), MI->getFlags());
6811+
MI->eraseFromParent();
6812+
}
6813+
}
6814+
67486815
bool CombinerHelper::matchAddSubSameReg(MachineInstr &MI, Register &Src) const {
67496816
assert(MI.getOpcode() == TargetOpcode::G_ADD && "Expected a G_ADD");
67506817
Register LHS = MI.getOperand(1).getReg();

llvm/test/CodeGen/AArch64/fdiv-combine.ll

Lines changed: 71 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,14 @@
1212
; =>
1313
; recip = 1.0 / D; a * recip; b * recip; c * recip;
1414
define void @three_fdiv_float(float %D, float %a, float %b, float %c) {
15-
; CHECK-SD-LABEL: three_fdiv_float:
16-
; CHECK-SD: // %bb.0:
17-
; CHECK-SD-NEXT: fmov s4, #1.00000000
18-
; CHECK-SD-NEXT: fdiv s4, s4, s0
19-
; CHECK-SD-NEXT: fmul s0, s1, s4
20-
; CHECK-SD-NEXT: fmul s1, s2, s4
21-
; CHECK-SD-NEXT: fmul s2, s3, s4
22-
; CHECK-SD-NEXT: b foo_3f
23-
;
24-
; CHECK-GI-LABEL: three_fdiv_float:
25-
; CHECK-GI: // %bb.0:
26-
; CHECK-GI-NEXT: fdiv s4, s1, s0
27-
; CHECK-GI-NEXT: fdiv s1, s2, s0
28-
; CHECK-GI-NEXT: fdiv s2, s3, s0
29-
; CHECK-GI-NEXT: fmov s0, s4
30-
; CHECK-GI-NEXT: b foo_3f
15+
; CHECK-LABEL: three_fdiv_float:
16+
; CHECK: // %bb.0:
17+
; CHECK-NEXT: fmov s4, #1.00000000
18+
; CHECK-NEXT: fdiv s4, s4, s0
19+
; CHECK-NEXT: fmul s0, s1, s4
20+
; CHECK-NEXT: fmul s1, s2, s4
21+
; CHECK-NEXT: fmul s2, s3, s4
22+
; CHECK-NEXT: b foo_3f
3123
%div = fdiv arcp float %a, %D
3224
%div1 = fdiv arcp float %b, %D
3325
%div2 = fdiv arcp float %c, %D
@@ -36,22 +28,14 @@ define void @three_fdiv_float(float %D, float %a, float %b, float %c) {
3628
}
3729

3830
define void @three_fdiv_double(double %D, double %a, double %b, double %c) {
39-
; CHECK-SD-LABEL: three_fdiv_double:
40-
; CHECK-SD: // %bb.0:
41-
; CHECK-SD-NEXT: fmov d4, #1.00000000
42-
; CHECK-SD-NEXT: fdiv d4, d4, d0
43-
; CHECK-SD-NEXT: fmul d0, d1, d4
44-
; CHECK-SD-NEXT: fmul d1, d2, d4
45-
; CHECK-SD-NEXT: fmul d2, d3, d4
46-
; CHECK-SD-NEXT: b foo_3d
47-
;
48-
; CHECK-GI-LABEL: three_fdiv_double:
49-
; CHECK-GI: // %bb.0:
50-
; CHECK-GI-NEXT: fdiv d4, d1, d0
51-
; CHECK-GI-NEXT: fdiv d1, d2, d0
52-
; CHECK-GI-NEXT: fdiv d2, d3, d0
53-
; CHECK-GI-NEXT: fmov d0, d4
54-
; CHECK-GI-NEXT: b foo_3d
31+
; CHECK-LABEL: three_fdiv_double:
32+
; CHECK: // %bb.0:
33+
; CHECK-NEXT: fmov d4, #1.00000000
34+
; CHECK-NEXT: fdiv d4, d4, d0
35+
; CHECK-NEXT: fmul d0, d1, d4
36+
; CHECK-NEXT: fmul d1, d2, d4
37+
; CHECK-NEXT: fmul d2, d3, d4
38+
; CHECK-NEXT: b foo_3d
5539
%div = fdiv arcp double %a, %D
5640
%div1 = fdiv arcp double %b, %D
5741
%div2 = fdiv arcp double %c, %D
@@ -60,22 +44,14 @@ define void @three_fdiv_double(double %D, double %a, double %b, double %c) {
6044
}
6145

6246
define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
63-
; CHECK-SD-LABEL: three_fdiv_4xfloat:
64-
; CHECK-SD: // %bb.0:
65-
; CHECK-SD-NEXT: fmov v4.4s, #1.00000000
66-
; CHECK-SD-NEXT: fdiv v4.4s, v4.4s, v0.4s
67-
; CHECK-SD-NEXT: fmul v0.4s, v1.4s, v4.4s
68-
; CHECK-SD-NEXT: fmul v1.4s, v2.4s, v4.4s
69-
; CHECK-SD-NEXT: fmul v2.4s, v3.4s, v4.4s
70-
; CHECK-SD-NEXT: b foo_3_4xf
71-
;
72-
; CHECK-GI-LABEL: three_fdiv_4xfloat:
73-
; CHECK-GI: // %bb.0:
74-
; CHECK-GI-NEXT: fdiv v4.4s, v1.4s, v0.4s
75-
; CHECK-GI-NEXT: fdiv v1.4s, v2.4s, v0.4s
76-
; CHECK-GI-NEXT: fdiv v2.4s, v3.4s, v0.4s
77-
; CHECK-GI-NEXT: mov v0.16b, v4.16b
78-
; CHECK-GI-NEXT: b foo_3_4xf
47+
; CHECK-LABEL: three_fdiv_4xfloat:
48+
; CHECK: // %bb.0:
49+
; CHECK-NEXT: fmov v4.4s, #1.00000000
50+
; CHECK-NEXT: fdiv v4.4s, v4.4s, v0.4s
51+
; CHECK-NEXT: fmul v0.4s, v1.4s, v4.4s
52+
; CHECK-NEXT: fmul v1.4s, v2.4s, v4.4s
53+
; CHECK-NEXT: fmul v2.4s, v3.4s, v4.4s
54+
; CHECK-NEXT: b foo_3_4xf
7955
%div = fdiv arcp <4 x float> %a, %D
8056
%div1 = fdiv arcp <4 x float> %b, %D
8157
%div2 = fdiv arcp <4 x float> %c, %D
@@ -84,22 +60,14 @@ define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b,
8460
}
8561

8662
define void @three_fdiv_2xdouble(<2 x double> %D, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
87-
; CHECK-SD-LABEL: three_fdiv_2xdouble:
88-
; CHECK-SD: // %bb.0:
89-
; CHECK-SD-NEXT: fmov v4.2d, #1.00000000
90-
; CHECK-SD-NEXT: fdiv v4.2d, v4.2d, v0.2d
91-
; CHECK-SD-NEXT: fmul v0.2d, v1.2d, v4.2d
92-
; CHECK-SD-NEXT: fmul v1.2d, v2.2d, v4.2d
93-
; CHECK-SD-NEXT: fmul v2.2d, v3.2d, v4.2d
94-
; CHECK-SD-NEXT: b foo_3_2xd
95-
;
96-
; CHECK-GI-LABEL: three_fdiv_2xdouble:
97-
; CHECK-GI: // %bb.0:
98-
; CHECK-GI-NEXT: fdiv v4.2d, v1.2d, v0.2d
99-
; CHECK-GI-NEXT: fdiv v1.2d, v2.2d, v0.2d
100-
; CHECK-GI-NEXT: fdiv v2.2d, v3.2d, v0.2d
101-
; CHECK-GI-NEXT: mov v0.16b, v4.16b
102-
; CHECK-GI-NEXT: b foo_3_2xd
63+
; CHECK-LABEL: three_fdiv_2xdouble:
64+
; CHECK: // %bb.0:
65+
; CHECK-NEXT: fmov v4.2d, #1.00000000
66+
; CHECK-NEXT: fdiv v4.2d, v4.2d, v0.2d
67+
; CHECK-NEXT: fmul v0.2d, v1.2d, v4.2d
68+
; CHECK-NEXT: fmul v1.2d, v2.2d, v4.2d
69+
; CHECK-NEXT: fmul v2.2d, v3.2d, v4.2d
70+
; CHECK-NEXT: b foo_3_2xd
10371
%div = fdiv arcp <2 x double> %a, %D
10472
%div1 = fdiv arcp <2 x double> %b, %D
10573
%div2 = fdiv arcp <2 x double> %c, %D
@@ -135,26 +103,47 @@ define void @two_fdiv_double(double %D, double %a, double %b) {
135103
ret void
136104
}
137105

138-
define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
139-
; CHECK-SD-LABEL: splat_three_fdiv_4xfloat:
106+
define void @four_fdiv_multi_float(float %D, float %a, float %b, float %c) #0 {
107+
; CHECK-SD-LABEL: four_fdiv_multi_float:
140108
; CHECK-SD: // %bb.0:
141-
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
142-
; CHECK-SD-NEXT: fmov v4.4s, #1.00000000
143-
; CHECK-SD-NEXT: dup v0.4s, v0.s[0]
144-
; CHECK-SD-NEXT: fdiv v4.4s, v4.4s, v0.4s
145-
; CHECK-SD-NEXT: fmul v0.4s, v1.4s, v4.4s
146-
; CHECK-SD-NEXT: fmul v1.4s, v2.4s, v4.4s
147-
; CHECK-SD-NEXT: fmul v2.4s, v3.4s, v4.4s
148-
; CHECK-SD-NEXT: b foo_3_4xf
109+
; CHECK-SD-NEXT: fmov s4, #1.00000000
110+
; CHECK-SD-NEXT: fdiv s5, s4, s0
111+
; CHECK-SD-NEXT: fmul s4, s1, s5
112+
; CHECK-SD-NEXT: fmul s1, s2, s5
113+
; CHECK-SD-NEXT: fmul s2, s3, s5
114+
; CHECK-SD-NEXT: fmul s3, s0, s5
115+
; CHECK-SD-NEXT: fmov s0, s4
116+
; CHECK-SD-NEXT: b foo_4f
149117
;
150-
; CHECK-GI-LABEL: splat_three_fdiv_4xfloat:
118+
; CHECK-GI-LABEL: four_fdiv_multi_float:
151119
; CHECK-GI: // %bb.0:
152-
; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
153-
; CHECK-GI-NEXT: dup v4.4s, v0.s[0]
154-
; CHECK-GI-NEXT: fdiv v0.4s, v1.4s, v4.4s
155-
; CHECK-GI-NEXT: fdiv v1.4s, v2.4s, v4.4s
156-
; CHECK-GI-NEXT: fdiv v2.4s, v3.4s, v4.4s
157-
; CHECK-GI-NEXT: b foo_3_4xf
120+
; CHECK-GI-NEXT: fmov s4, #1.00000000
121+
; CHECK-GI-NEXT: fdiv s5, s4, s0
122+
; CHECK-GI-NEXT: fdiv s4, s0, s0
123+
; CHECK-GI-NEXT: fmul s0, s1, s5
124+
; CHECK-GI-NEXT: fmul s1, s2, s5
125+
; CHECK-GI-NEXT: fmul s2, s3, s5
126+
; CHECK-GI-NEXT: fmov s3, s4
127+
; CHECK-GI-NEXT: b foo_4f
128+
%div = fdiv arcp float %a, %D
129+
%div1 = fdiv arcp float %b, %D
130+
%div2 = fdiv arcp float %c, %D
131+
%div3 = fdiv arcp float %D, %D
132+
tail call void @foo_4f(float %div, float %div1, float %div2, float %div3)
133+
ret void
134+
}
135+
136+
define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
137+
; CHECK-LABEL: splat_three_fdiv_4xfloat:
138+
; CHECK: // %bb.0:
139+
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
140+
; CHECK-NEXT: fmov v4.4s, #1.00000000
141+
; CHECK-NEXT: dup v0.4s, v0.s[0]
142+
; CHECK-NEXT: fdiv v4.4s, v4.4s, v0.4s
143+
; CHECK-NEXT: fmul v0.4s, v1.4s, v4.4s
144+
; CHECK-NEXT: fmul v1.4s, v2.4s, v4.4s
145+
; CHECK-NEXT: fmul v2.4s, v3.4s, v4.4s
146+
; CHECK-NEXT: b foo_3_4xf
158147
%D.ins = insertelement <4 x float> poison, float %D, i64 0
159148
%splat = shufflevector <4 x float> %D.ins, <4 x float> poison, <4 x i32> zeroinitializer
160149
%div = fdiv arcp <4 x float> %a, %splat
@@ -256,6 +245,7 @@ entry:
256245
}
257246

258247
declare void @foo_3f(float, float, float)
248+
declare void @foo_4f(float, float, float, float)
259249
declare void @foo_3d(double, double, double)
260250
declare void @foo_3_4xf(<4 x float>, <4 x float>, <4 x float>)
261251
declare void @foo_3_2xd(<2 x double>, <2 x double>, <2 x double>)

0 commit comments

Comments
 (0)