Skip to content

Commit c186994

Browse files
committed
Moved splitting to later PR
1 parent 5db25bf commit c186994

File tree

3 files changed

+100
-154
lines changed

3 files changed

+100
-154
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1644,13 +1644,8 @@ class TargetLoweringBase {
16441644
/// larger size, needs to be expanded to some other code sequence, or the
16451645
/// target has a custom expander for it.
16461646
LegalizeAction getPartialReduceMLAAction(EVT AccVT, EVT InputVT) const {
1647-
auto AccSVT = AccVT.getSimpleVT();
1648-
auto InputSVT = InputVT.getSimpleVT();
1649-
assert(AccSVT.isValid() && InputSVT.isValid() &&
1650-
"getPartialReduceMLAAction types aren't valid");
1651-
auto AccI = AccSVT.SimpleTy;
1652-
auto InputI = InputSVT.SimpleTy;
1653-
PartialReduceActionTypes TypePair = std::make_pair(AccI, InputI);
1647+
PartialReduceActionTypes TypePair = {AccVT.getSimpleVT().SimpleTy,
1648+
InputVT.getSimpleVT().SimpleTy};
16541649
auto It = PartialReduceMLAActions.find(TypePair);
16551650
if (It != PartialReduceMLAActions.end())
16561651
return It->second;
@@ -1660,8 +1655,8 @@ class TargetLoweringBase {
16601655
/// Return true if a PARTIAL_REDUCE_U/SMLA node with the specified types is
16611656
/// legal or custom for this target.
16621657
bool isPartialReduceMLALegalOrCustom(EVT AccVT, EVT InputVT) const {
1663-
return getPartialReduceMLAAction(AccVT, InputVT) == Legal ||
1664-
getPartialReduceMLAAction(AccVT, InputVT) == Custom;
1658+
LegalizeAction Action = getPartialReduceMLAAction(AccVT, InputVT);
1659+
return Action == Legal || Action == Custom;
16651660
}
16661661

16671662
/// If the action for this operation is to promote, this method returns the
@@ -2745,9 +2740,7 @@ class TargetLoweringBase {
27452740
LegalizeAction Action) {
27462741
assert(AccVT.isValid() && InputVT.isValid() &&
27472742
"setPartialReduceMLAAction types aren't valid");
2748-
auto AccI = AccVT.SimpleTy;
2749-
auto InputI = InputVT.SimpleTy;
2750-
PartialReduceActionTypes TypePair = std::make_pair(AccI, InputI);
2743+
PartialReduceActionTypes TypePair = {AccVT.SimpleTy, InputVT.SimpleTy};
27512744
PartialReduceMLAActions[TypePair] = Action;
27522745
}
27532746

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 22 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1585,44 +1585,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
15851585
setOperationAction(ISD::MSTORE, VT, Custom);
15861586
}
15871587

1588-
if (EnablePartialReduceNodes) {
1589-
// Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1590-
// Other pairs will default to 'Expand'.
1591-
setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i16, Legal);
1592-
setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i8, Legal);
1593-
1594-
auto CanSplitToLegalPartialReduce = [&](MVT AccTy, MVT InTy) {
1595-
while (true) {
1596-
switch (getTypeAction(AccTy)) {
1597-
case TargetLoweringBase::TypeLegal:
1598-
return isPartialReduceMLALegalOrCustom(AccTy, InTy);
1599-
case TargetLoweringBase::TypeSplitVector:
1600-
if (!InTy.getVectorElementCount().isKnownEven())
1601-
return false;
1602-
// Currently, we only implement splitting for partial reductions,
1603-
// which splits the result and both operands.
1604-
AccTy = AccTy.getHalfNumVectorElementsVT();
1605-
InTy = InTy.getHalfNumVectorElementsVT();
1606-
break;
1607-
default:
1608-
// Assume all other type pairs are expanded.
1609-
return false;
1610-
}
1611-
}
1612-
};
1613-
1614-
for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
1615-
for (MVT InnerVT : MVT::integer_scalable_vector_valuetypes()) {
1616-
// Mark illegal type pairs that split to a legal pair as legal.
1617-
// This is needed as otherwise we may not apply useful combines.
1618-
if (!isTypeLegal(VT) || !isTypeLegal(InnerVT)) {
1619-
if (CanSplitToLegalPartialReduce(VT, InnerVT))
1620-
setPartialReduceMLAAction(VT, InnerVT, Legal);
1621-
}
1622-
}
1623-
}
1624-
}
1625-
16261588
// Firstly, exclude all scalable vector extending loads/truncating stores,
16271589
// include both integer and floating scalable vector.
16281590
for (MVT VT : MVT::scalable_vector_valuetypes()) {
@@ -1868,6 +1830,28 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
18681830
setOperationAction(ISD::INTRINSIC_WO_CHAIN, VT, Custom);
18691831
}
18701832

1833+
// Handle partial reduction operations
1834+
if (EnablePartialReduceNodes) {
1835+
auto SetPartialReductionMLAActionAsAppropriate = [&](MVT AccVt,
1836+
MVT InnerVT) -> void {
1837+
if (!isTypeLegal(AccVt) || !isTypeLegal(InnerVT))
1838+
setPartialReduceMLAAction(AccVt, InnerVT, Legal);
1839+
};
1840+
1841+
if (Subtarget->isSVEorStreamingSVEAvailable()) {
1842+
// Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1843+
// Other pairs will default to 'Expand'.
1844+
setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i16, Legal);
1845+
setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i8, Legal);
1846+
1847+
for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
1848+
for (MVT InnerVT : MVT::integer_scalable_vector_valuetypes()) {
1849+
SetPartialReductionMLAActionAsAppropriate(VT, InnerVT);
1850+
}
1851+
}
1852+
}
1853+
}
1854+
18711855
// Handle operations that are only available in non-streaming SVE mode.
18721856
if (Subtarget->isSVEAvailable()) {
18731857
for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
@@ -1907,7 +1891,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
19071891
}
19081892
}
19091893

1910-
19111894
if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
19121895
// Only required for llvm.aarch64.mops.memset.tag
19131896
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll

Lines changed: 73 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -206,59 +206,46 @@ define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
206206
;
207207
; CHECK-NEWLOWERING-LABEL: udot_8to64:
208208
; CHECK-NEWLOWERING: // %bb.0: // %entry
209-
; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
210-
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
211-
; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
212-
; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
213-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
214-
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
215-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
216-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
217-
; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
218-
; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
219-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
209+
; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z3.b
210+
; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z2.b
220211
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
212+
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
221213
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
222214
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
223-
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
224215
; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
216+
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
225217
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
226-
; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
227-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
228-
; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
229-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
230-
; CHECK-NEWLOWERING-NEXT: uunpkhi z26.d, z6.s
231-
; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z6.s
232-
; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z4.s
233-
; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z7.s
218+
; CHECK-NEWLOWERING-NEXT: uunpkhi z24.s, z3.h
219+
; CHECK-NEWLOWERING-NEXT: uunpkhi z25.s, z2.h
220+
; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z3.h
221+
; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z2.h
222+
; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z6.s
223+
; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z7.s
224+
; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z6.s
225+
; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
226+
; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z4.s
234227
; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z5.s
235228
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
236-
; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
237229
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
238-
; CHECK-NEWLOWERING-NEXT: uunpkhi z30.d, z24.s
239-
; CHECK-NEWLOWERING-NEXT: uunpkhi z31.d, z2.s
240-
; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z24.s
241-
; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
242-
; CHECK-NEWLOWERING-NEXT: uunpkhi z8.d, z25.s
243-
; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z25.s
244-
; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z3.s
245-
; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
246-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
230+
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z26.d
231+
; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z24.s
232+
; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z24.s
233+
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z6.d
234+
; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z25.s
235+
; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z3.s
236+
; CHECK-NEWLOWERING-NEXT: mul z27.d, z29.d, z28.d
237+
; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z2.s
238+
; CHECK-NEWLOWERING-NEXT: uunpkhi z25.d, z25.s
247239
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
248-
; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
249-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
250-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
251-
; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
252-
; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
253-
; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
254-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
255-
; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
256-
; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
257-
; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
258-
; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
259-
; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
260-
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
261-
; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
240+
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
241+
; CHECK-NEWLOWERING-NEXT: mul z4.d, z5.d, z4.d
242+
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z26.d
243+
; CHECK-NEWLOWERING-NEXT: movprfx z5, z27
244+
; CHECK-NEWLOWERING-NEXT: mla z5.d, p0/m, z28.d, z7.d
245+
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z25.d, z24.d
246+
; CHECK-NEWLOWERING-NEXT: mad z2.d, p0/m, z3.d, z4.d
247+
; CHECK-NEWLOWERING-NEXT: add z0.d, z5.d, z0.d
248+
; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d
262249
; CHECK-NEWLOWERING-NEXT: ret
263250
entry:
264251
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -282,59 +269,46 @@ define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
282269
;
283270
; CHECK-NEWLOWERING-LABEL: sdot_8to64:
284271
; CHECK-NEWLOWERING: // %bb.0: // %entry
285-
; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
286-
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
287-
; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
288-
; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
289-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
290-
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
291-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
292-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
293-
; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
294-
; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
295-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
272+
; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z3.b
273+
; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z2.b
296274
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
275+
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
297276
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
298277
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
299-
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
300278
; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
279+
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
301280
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
302-
; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
303-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
304-
; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
305-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
306-
; CHECK-NEWLOWERING-NEXT: sunpkhi z26.d, z6.s
307-
; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z6.s
308-
; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z4.s
309-
; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z7.s
281+
; CHECK-NEWLOWERING-NEXT: sunpkhi z24.s, z3.h
282+
; CHECK-NEWLOWERING-NEXT: sunpkhi z25.s, z2.h
283+
; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z3.h
284+
; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z2.h
285+
; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z6.s
286+
; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z7.s
287+
; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z6.s
288+
; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
289+
; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z4.s
310290
; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z5.s
311291
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
312-
; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
313292
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
314-
; CHECK-NEWLOWERING-NEXT: sunpkhi z30.d, z24.s
315-
; CHECK-NEWLOWERING-NEXT: sunpkhi z31.d, z2.s
316-
; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z24.s
317-
; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
318-
; CHECK-NEWLOWERING-NEXT: sunpkhi z8.d, z25.s
319-
; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z25.s
320-
; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z3.s
321-
; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
322-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
293+
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z26.d
294+
; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z24.s
295+
; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z24.s
296+
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z6.d
297+
; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z25.s
298+
; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z3.s
299+
; CHECK-NEWLOWERING-NEXT: mul z27.d, z29.d, z28.d
300+
; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z2.s
301+
; CHECK-NEWLOWERING-NEXT: sunpkhi z25.d, z25.s
323302
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
324-
; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
325-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
326-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
327-
; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
328-
; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
329-
; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
330-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
331-
; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
332-
; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
333-
; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
334-
; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
335-
; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
336-
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
337-
; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
303+
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
304+
; CHECK-NEWLOWERING-NEXT: mul z4.d, z5.d, z4.d
305+
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z26.d
306+
; CHECK-NEWLOWERING-NEXT: movprfx z5, z27
307+
; CHECK-NEWLOWERING-NEXT: mla z5.d, p0/m, z28.d, z7.d
308+
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z25.d, z24.d
309+
; CHECK-NEWLOWERING-NEXT: mad z2.d, p0/m, z3.d, z4.d
310+
; CHECK-NEWLOWERING-NEXT: add z0.d, z5.d, z0.d
311+
; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d
338312
; CHECK-NEWLOWERING-NEXT: ret
339313
entry:
340314
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -816,11 +790,11 @@ define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %
816790
; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff
817791
; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
818792
; CHECK-NEWLOWERING-NEXT: ptrue p0.s
819-
; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h
820-
; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h
821-
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
793+
; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z2.h
794+
; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z1.h
822795
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
823-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z3.s, z4.s
796+
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
797+
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z4.s, z3.s
824798
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
825799
; CHECK-NEWLOWERING-NEXT: ret
826800
entry:
@@ -850,11 +824,11 @@ define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x
850824
; CHECK-NEWLOWERING-NEXT: and z1.s, z1.s, #0xffff
851825
; CHECK-NEWLOWERING-NEXT: and z2.s, z2.s, #0xffff
852826
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
853-
; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z1.s
854-
; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z2.s
855-
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
827+
; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z2.s
828+
; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z1.s
856829
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
857-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d
830+
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
831+
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z3.d
858832
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
859833
; CHECK-NEWLOWERING-NEXT: ret
860834
entry:
@@ -1221,10 +1195,8 @@ define <vscale x 2 x i16> @udot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale
12211195
;
12221196
; CHECK-NEWLOWERING-LABEL: udot_nxv8i8_promote:
12231197
; CHECK-NEWLOWERING: // %bb.0: // %entry
1224-
; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff
12251198
; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
1226-
; CHECK-NEWLOWERING-NEXT: mul z1.h, z1.h, z2.h
1227-
; CHECK-NEWLOWERING-NEXT: mov z2.h, #1 // =0x1
1199+
; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff
12281200
; CHECK-NEWLOWERING-NEXT: udot z0.d, z1.h, z2.h
12291201
; CHECK-NEWLOWERING-NEXT: ret
12301202
entry:
@@ -1257,11 +1229,9 @@ define <vscale x 2 x i16> @sdot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale
12571229
; CHECK-NEWLOWERING-LABEL: sdot_nxv8i8_promote:
12581230
; CHECK-NEWLOWERING: // %bb.0: // %entry
12591231
; CHECK-NEWLOWERING-NEXT: ptrue p0.h
1260-
; CHECK-NEWLOWERING-NEXT: sxtb z1.h, p0/m, z1.h
12611232
; CHECK-NEWLOWERING-NEXT: sxtb z2.h, p0/m, z2.h
1262-
; CHECK-NEWLOWERING-NEXT: mul z1.h, z1.h, z2.h
1263-
; CHECK-NEWLOWERING-NEXT: mov z2.h, #1 // =0x1
1264-
; CHECK-NEWLOWERING-NEXT: udot z0.d, z1.h, z2.h
1233+
; CHECK-NEWLOWERING-NEXT: sxtb z1.h, p0/m, z1.h
1234+
; CHECK-NEWLOWERING-NEXT: sdot z0.d, z1.h, z2.h
12651235
; CHECK-NEWLOWERING-NEXT: ret
12661236
entry:
12671237
%a.wide = sext <vscale x 8 x i8> %a to <vscale x 8 x i16>

0 commit comments

Comments
 (0)