
Commit 1adef4d

Add different argument order pattern for mul
Optimize mul for merge variants where the splat value is the first operand. This is equivalent to a neg whose merge (passthru) value is a (mov -1) and whose operand is op2. Remove the redundant commutativity pattern for undef variants of mul, since after instcombine the splat value should only appear as the second operand. Adapt the tests to cover the new patterns and remove the now-redundant non-optimization cases.
1 parent 725e197 commit 1adef4d
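
For reference, a sketch of the i64 merging case; the IR and both assembly sequences are taken from the removed and added test cases below. The mov of #-1 is still emitted because the merging mul takes its inactive lanes from the splat operand:

  ; Merging mul whose first (inactive-lanes) operand is a splat of -1.
  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)

  ; Codegen before this commit:        ; Codegen with the new Replace pattern:
  ;   mov z1.d, #-1                    ;   mov z1.d, #-1
  ;   mul z1.d, p0/m, z1.d, z0.d       ;   neg z1.d, p0/m, z0.d
  ;   mov z0.d, z1.d                   ;   mov z0.d, z1.d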

File tree

3 files changed (+36 -130 lines changed):

  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
  llvm/lib/Target/AArch64/SVEInstrFormats.td
  llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 9 additions & 4 deletions
@@ -737,10 +737,10 @@ let Predicates = [HasSVE_or_SME] in {
   def : SVE_2_Op_Neg_One_Passthru_Pat<nxv2i64, AArch64mul_m1, nxv2i1, NEG_ZPmZ_D , i64>;

   let AddedComplexity = 5 in {
-    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv16i8, AArch64mul_p, nxv16i1, NEG_ZPmZ_B_UNDEF, i32>;
-    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv8i16, AArch64mul_p, nxv8i1, NEG_ZPmZ_H_UNDEF, i32>;
-    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv4i32, AArch64mul_p, nxv4i1, NEG_ZPmZ_S_UNDEF, i32>;
-    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv2i64, AArch64mul_p, nxv2i1, NEG_ZPmZ_D_UNDEF, i64>;
+    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv16i8, AArch64mul_p, nxv16i1, NEG_ZPmZ_B_UNDEF, i32>;
+    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv8i16, AArch64mul_p, nxv8i1, NEG_ZPmZ_H_UNDEF, i32>;
+    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv4i32, AArch64mul_p, nxv4i1, NEG_ZPmZ_S_UNDEF, i32>;
+    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv2i64, AArch64mul_p, nxv2i1, NEG_ZPmZ_D_UNDEF, i64>;
   }

   defm CLS_ZPmZ : sve_int_un_pred_arit_bitwise< 0b000, "cls", AArch64cls_mt>;
@@ -1025,6 +1025,11 @@ let Predicates = [HasSVE_or_SME] in {
   defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>;

   defm SPLICE_ZPZ : sve_int_perm_splice<"splice", AArch64splice>;
+
+  def : SVE_2_Op_Neg_One_Replace_Pat<nxv16i8, AArch64mul_m1, nxv16i1, NEG_ZPmZ_B , DUP_ZI_B, i32>;
+  def : SVE_2_Op_Neg_One_Replace_Pat<nxv8i16, AArch64mul_m1, nxv8i1, NEG_ZPmZ_H , DUP_ZI_H, i32>;
+  def : SVE_2_Op_Neg_One_Replace_Pat<nxv4i32, AArch64mul_m1, nxv4i1, NEG_ZPmZ_S , DUP_ZI_S, i32>;
+  def : SVE_2_Op_Neg_One_Replace_Pat<nxv2i64, AArch64mul_m1, nxv2i1, NEG_ZPmZ_D , DUP_ZI_D, i64>;
 } // End HasSVE_or_SME

 // COMPACT - word and doubleword

llvm/lib/Target/AArch64/SVEInstrFormats.td

Lines changed: 6 additions & 9 deletions
@@ -723,19 +723,16 @@ class SVE2p1_Cvt_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out
   : Pat<(out_vt (intrinsic in_vt:$Zn1, in_vt:$Zn2)),
         (!cast<Instruction>(name) (REG_SEQUENCE ZPR2Mul2, in_vt:$Zn1, zsub0, in_vt:$Zn2, zsub1))>;

-class SVE_2_Op_Neg_One_Passthru_Pat<ValueType vt, SDPatternOperator op, ValueType pt,
+class SVE_2_Op_Neg_One_Replace_Pat<ValueType vt, SDPatternOperator op, ValueType pt,
+                                   Instruction inst, Instruction dup_inst, ValueType immT>
+  : Pat<(vt (op pt:$Op1, (vt (splat_vector (immT -1))), vt:$Op2)),
+        (inst (dup_inst -1, 0) , $Op1, $Op2)>;
+
+class SVE_2_Op_Neg_One_Passthru_Pat<ValueType vt, SDPatternOperator op, ValueType pt,
                                     Instruction inst, ValueType immT>
   : Pat<(vt (op pt:$Op1, vt:$Op2, (vt (splat_vector (immT -1))))),
         (inst $Op2, $Op1, $Op2)>;

-// Same as above, but commutative
-multiclass SVE_2_Op_Neg_One_Passthru_Pat_Comm<ValueType vt, SDPatternOperator op, ValueType pt,
-                                              Instruction inst, ValueType immT> {
-  def : Pat<(vt (op pt:$Op1, vt:$Op2, (vt (splat_vector (immT -1))))),
-            (inst $Op2, $Op1, $Op2)>;
-  def : Pat<(vt (op pt:$Op1, (vt (splat_vector (immT -1))), vt:$Op2)),
-            (inst $Op2, $Op1, $Op2)>;
-}
 //===----------------------------------------------------------------------===//
 // SVE pattern match helpers.
 //===----------------------------------------------------------------------===//

llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll

Lines changed: 21 additions & 117 deletions
@@ -44,20 +44,6 @@ define <vscale x 2 x i64> @mul_neg_fold_i64(<vscale x 2 x i1> %pg, <vscale x 2 x
   ret <vscale x 2 x i64> %2
 }

-define <vscale x 8 x i16> @mul_neg_fold_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
-; Edge case -- make sure that the case where we're multiplying two dups
-; together is sane.
-; CHECK-LABEL: mul_neg_fold_two_dups:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.h, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    neg z0.h, p0/m, z0.h
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
-  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
-  %3 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2)
-  ret <vscale x 8 x i16> %3
-}
-
 define <vscale x 16 x i8> @mul_neg_fold_u_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
 ; CHECK-LABEL: mul_neg_fold_u_i8:
 ; CHECK:       // %bb.0:
@@ -98,136 +84,54 @@ define <vscale x 2 x i64> @mul_neg_fold_u_i64(<vscale x 2 x i1> %pg, <vscale x 2
   ret <vscale x 2 x i64> %2
 }

-define <vscale x 8 x i16> @mul_neg_fold_u_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
-; CHECK-LABEL: mul_neg_fold_u_two_dups:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.h, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    neg z0.h, p0/m, z0.h
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
-  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
-  %3 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2)
-  ret <vscale x 8 x i16> %3
-}
-
-; Undefined mul is commutative
-define <vscale x 16 x i8> @mul_neg_fold_u_different_argument_order_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
-; CHECK-LABEL: mul_neg_fold_u_different_argument_order_i8:
+define <vscale x 16 x i8> @mul_neg_fold_different_argument_order_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
+; CHECK-LABEL: mul_neg_fold_different_argument_order_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg z0.b, p0/m, z0.b
+; CHECK-NEXT:    mov z1.b, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    neg z1.b, p0/m, z0.b
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8 -1)
-  %2 = call <vscale x 16 x i8> @llvm.aarch64.sve.mul.u.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %1, <vscale x 16 x i8> %a)
+  %2 = call <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %1, <vscale x 16 x i8> %a)
   ret <vscale x 16 x i8> %2
 }

-define <vscale x 8 x i16> @mul_neg_fold_u_different_argument_order_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
-; CHECK-LABEL: mul_neg_fold_u_different_argument_order_i16:
+define <vscale x 8 x i16> @mul_neg_fold_different_argument_order_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: mul_neg_fold_different_argument_order_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg z0.h, p0/m, z0.h
+; CHECK-NEXT:    mov z1.h, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    neg z1.h, p0/m, z0.h
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
-  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %a)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %a)
   ret <vscale x 8 x i16> %2
 }

-define <vscale x 4 x i32> @mul_neg_fold_u_different_argument_order_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
-; CHECK-LABEL: mul_neg_fold_u_different_argument_order_i32:
+define <vscale x 4 x i32> @mul_neg_fold_different_argument_order_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: mul_neg_fold_different_argument_order_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg z0.s, p0/m, z0.s
+; CHECK-NEXT:    mov z1.s, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    neg z1.s, p0/m, z0.s
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
-  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a)
   ret <vscale x 4 x i32> %2
 }

-define <vscale x 2 x i64> @mul_neg_fold_u_different_argument_order_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
-; CHECK-LABEL: mul_neg_fold_u_different_argument_order_i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg z0.d, p0/m, z0.d
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
-  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
-  ret <vscale x 2 x i64> %2
-}
-
-; Non foldable muls -- we don't expect these to be optimised out.
-define <vscale x 8 x i16> @no_mul_neg_fold_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
-; CHECK-LABEL: no_mul_neg_fold_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.h, #-2 // =0xfffffffffffffffe
-; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -2)
-  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
-  ret <vscale x 8 x i16> %2
-}
-
-define <vscale x 4 x i32> @no_mul_neg_fold_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
-; CHECK-LABEL: no_mul_neg_fold_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.s, #-2 // =0xfffffffffffffffe
-; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -2)
-  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
-  ret <vscale x 4 x i32> %2
-}
-
-define <vscale x 2 x i64> @no_mul_neg_fold_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
-; CHECK-LABEL: no_mul_neg_fold_i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.d, #-2 // =0xfffffffffffffffe
-; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -2)
-  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
-  ret <vscale x 2 x i64> %2
-}
-
-; Merge mul is non commutative
-define <vscale x 2 x i64> @no_mul_neg_fold_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
-; CHECK-LABEL: no_mul_neg_fold_different_argument_order:
+define <vscale x 2 x i64> @mul_neg_fold_different_argument_order_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: mul_neg_fold_different_argument_order_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z1.d, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT:    neg z1.d, p0/m, z0.d
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
   %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
   ret <vscale x 2 x i64> %2
 }

-define <vscale x 8 x i16> @no_mul_neg_fold_u_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
-; CHECK-LABEL: no_mul_neg_fold_u_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul z0.h, z0.h, #-2
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -2)
-  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
-  ret <vscale x 8 x i16> %2
-}
-
-define <vscale x 4 x i32> @no_mul_neg_fold_u_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
-; CHECK-LABEL: no_mul_neg_fold_u_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul z0.s, z0.s, #-2
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -2)
-  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
-  ret <vscale x 4 x i32> %2
-}
-
-define <vscale x 2 x i64> @no_mul_neg_fold_u_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
-; CHECK-LABEL: no_mul_neg_fold_u_i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul z0.d, z0.d, #-2
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -2)
-  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
-  ret <vscale x 2 x i64> %2
-}
-
 declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
