Skip to content

Commit 5db25bf

Browse files
committed
Adjust partial reduction type legality check
1 parent 364835d commit 5db25bf

File tree

2 files changed

+132
-84
lines changed

2 files changed

+132
-84
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1586,23 +1586,41 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
15861586
}
15871587

15881588
if (EnablePartialReduceNodes) {
1589+
// Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1590+
// Other pairs will default to 'Expand'.
1591+
setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i16, Legal);
1592+
setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i8, Legal);
1593+
1594+
auto CanSplitToLegalPartialReduce = [&](MVT AccTy, MVT InTy) {
1595+
while (true) {
1596+
switch (getTypeAction(AccTy)) {
1597+
case TargetLoweringBase::TypeLegal:
1598+
return isPartialReduceMLALegalOrCustom(AccTy, InTy);
1599+
case TargetLoweringBase::TypeSplitVector:
1600+
if (!InTy.getVectorElementCount().isKnownEven())
1601+
return false;
1602+
// Currently, we only implement splitting for partial reductions,
1603+
// which splits the result and both operands.
1604+
AccTy = AccTy.getHalfNumVectorElementsVT();
1605+
InTy = InTy.getHalfNumVectorElementsVT();
1606+
break;
1607+
default:
1608+
// Assume all other type pairs are expanded.
1609+
return false;
1610+
}
1611+
}
1612+
};
1613+
15891614
for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
15901615
for (MVT InnerVT : MVT::integer_scalable_vector_valuetypes()) {
1591-
// 1. Set all combinations where a type is illegal to "Legal"
1592-
// - These will be legalized to a legal type pair
1593-
// - Avoid expanding them too early (or preventing folds)
1616+
// Mark illegal type pairs that split to a legal pair as legal.
1617+
// This is needed as otherwise we may not apply useful combines.
15941618
if (!isTypeLegal(VT) || !isTypeLegal(InnerVT)) {
1595-
setPartialReduceMLAAction(VT, InnerVT, Legal);
1596-
continue;
1619+
if (CanSplitToLegalPartialReduce(VT, InnerVT))
1620+
setPartialReduceMLAAction(VT, InnerVT, Legal);
15971621
}
1598-
// 2. Set all legal combinations to "Expand"
1599-
// - Not all of these can be lowered (via a Legal or Custom lowering).
1600-
setPartialReduceMLAAction(VT, InnerVT, Expand);
16011622
}
16021623
}
1603-
// 3. Mark known legal pairs as 'Legal' (these will expand to USDOT).
1604-
setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i16, Legal);
1605-
setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i8, Legal);
16061624
}
16071625

16081626
// Firstly, exclude all scalable vector extending loads/truncating stores,

llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll

Lines changed: 103 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -206,46 +206,59 @@ define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
206206
;
207207
; CHECK-NEWLOWERING-LABEL: udot_8to64:
208208
; CHECK-NEWLOWERING: // %bb.0: // %entry
209-
; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z3.b
210-
; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z2.b
211-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
209+
; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
210+
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
211+
; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
212+
; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
213+
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
214+
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
215+
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
216+
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
217+
; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
218+
; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
212219
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
220+
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
213221
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
214222
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
215-
; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
216223
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
224+
; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
217225
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
218-
; CHECK-NEWLOWERING-NEXT: uunpkhi z24.s, z3.h
219-
; CHECK-NEWLOWERING-NEXT: uunpkhi z25.s, z2.h
220-
; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z3.h
221-
; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z2.h
222-
; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z6.s
223-
; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z7.s
224-
; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z6.s
225-
; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
226-
; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z4.s
226+
; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
227+
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
228+
; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
229+
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
230+
; CHECK-NEWLOWERING-NEXT: uunpkhi z26.d, z6.s
231+
; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z6.s
232+
; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z4.s
233+
; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z7.s
227234
; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z5.s
228235
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
236+
; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
229237
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
230-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z26.d
231-
; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z24.s
232-
; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z24.s
233-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z6.d
234-
; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z25.s
235-
; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z3.s
236-
; CHECK-NEWLOWERING-NEXT: mul z27.d, z29.d, z28.d
237-
; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z2.s
238-
; CHECK-NEWLOWERING-NEXT: uunpkhi z25.d, z25.s
238+
; CHECK-NEWLOWERING-NEXT: uunpkhi z30.d, z24.s
239+
; CHECK-NEWLOWERING-NEXT: uunpkhi z31.d, z2.s
240+
; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z24.s
241+
; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
242+
; CHECK-NEWLOWERING-NEXT: uunpkhi z8.d, z25.s
243+
; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z25.s
244+
; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z3.s
245+
; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
246+
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
239247
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
240-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
241-
; CHECK-NEWLOWERING-NEXT: mul z4.d, z5.d, z4.d
242-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z26.d
243-
; CHECK-NEWLOWERING-NEXT: movprfx z5, z27
244-
; CHECK-NEWLOWERING-NEXT: mla z5.d, p0/m, z28.d, z7.d
245-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z25.d, z24.d
246-
; CHECK-NEWLOWERING-NEXT: mad z2.d, p0/m, z3.d, z4.d
247-
; CHECK-NEWLOWERING-NEXT: add z0.d, z5.d, z0.d
248-
; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d
248+
; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
249+
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
250+
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
251+
; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
252+
; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
253+
; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
254+
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
255+
; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
256+
; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
257+
; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
258+
; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
259+
; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
260+
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
261+
; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
249262
; CHECK-NEWLOWERING-NEXT: ret
250263
entry:
251264
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -269,46 +282,59 @@ define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
269282
;
270283
; CHECK-NEWLOWERING-LABEL: sdot_8to64:
271284
; CHECK-NEWLOWERING: // %bb.0: // %entry
272-
; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z3.b
273-
; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z2.b
274-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
285+
; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
286+
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
287+
; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
288+
; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
289+
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
290+
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
291+
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
292+
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
293+
; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
294+
; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
275295
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
296+
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
276297
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
277298
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
278-
; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
279299
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
300+
; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
280301
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
281-
; CHECK-NEWLOWERING-NEXT: sunpkhi z24.s, z3.h
282-
; CHECK-NEWLOWERING-NEXT: sunpkhi z25.s, z2.h
283-
; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z3.h
284-
; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z2.h
285-
; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z6.s
286-
; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z7.s
287-
; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z6.s
288-
; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
289-
; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z4.s
302+
; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
303+
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
304+
; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
305+
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
306+
; CHECK-NEWLOWERING-NEXT: sunpkhi z26.d, z6.s
307+
; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z6.s
308+
; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z4.s
309+
; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z7.s
290310
; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z5.s
291311
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
312+
; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
292313
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
293-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z26.d
294-
; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z24.s
295-
; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z24.s
296-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z6.d
297-
; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z25.s
298-
; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z3.s
299-
; CHECK-NEWLOWERING-NEXT: mul z27.d, z29.d, z28.d
300-
; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z2.s
301-
; CHECK-NEWLOWERING-NEXT: sunpkhi z25.d, z25.s
314+
; CHECK-NEWLOWERING-NEXT: sunpkhi z30.d, z24.s
315+
; CHECK-NEWLOWERING-NEXT: sunpkhi z31.d, z2.s
316+
; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z24.s
317+
; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
318+
; CHECK-NEWLOWERING-NEXT: sunpkhi z8.d, z25.s
319+
; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z25.s
320+
; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z3.s
321+
; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
322+
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
302323
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
303-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
304-
; CHECK-NEWLOWERING-NEXT: mul z4.d, z5.d, z4.d
305-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z26.d
306-
; CHECK-NEWLOWERING-NEXT: movprfx z5, z27
307-
; CHECK-NEWLOWERING-NEXT: mla z5.d, p0/m, z28.d, z7.d
308-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z25.d, z24.d
309-
; CHECK-NEWLOWERING-NEXT: mad z2.d, p0/m, z3.d, z4.d
310-
; CHECK-NEWLOWERING-NEXT: add z0.d, z5.d, z0.d
311-
; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d
324+
; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
325+
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
326+
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
327+
; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
328+
; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
329+
; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
330+
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
331+
; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
332+
; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
333+
; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
334+
; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
335+
; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
336+
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
337+
; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
312338
; CHECK-NEWLOWERING-NEXT: ret
313339
entry:
314340
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -790,11 +816,11 @@ define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %
790816
; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff
791817
; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
792818
; CHECK-NEWLOWERING-NEXT: ptrue p0.s
793-
; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z2.h
794-
; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z1.h
795-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
819+
; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h
820+
; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h
796821
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
797-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z4.s, z3.s
822+
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
823+
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z3.s, z4.s
798824
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
799825
; CHECK-NEWLOWERING-NEXT: ret
800826
entry:
@@ -824,11 +850,11 @@ define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x
824850
; CHECK-NEWLOWERING-NEXT: and z1.s, z1.s, #0xffff
825851
; CHECK-NEWLOWERING-NEXT: and z2.s, z2.s, #0xffff
826852
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
827-
; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z2.s
828-
; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z1.s
829-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
853+
; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z1.s
854+
; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z2.s
830855
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
831-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z3.d
856+
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
857+
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d
832858
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
833859
; CHECK-NEWLOWERING-NEXT: ret
834860
entry:
@@ -1195,8 +1221,10 @@ define <vscale x 2 x i16> @udot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale
11951221
;
11961222
; CHECK-NEWLOWERING-LABEL: udot_nxv8i8_promote:
11971223
; CHECK-NEWLOWERING: // %bb.0: // %entry
1198-
; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
11991224
; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff
1225+
; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
1226+
; CHECK-NEWLOWERING-NEXT: mul z1.h, z1.h, z2.h
1227+
; CHECK-NEWLOWERING-NEXT: mov z2.h, #1 // =0x1
12001228
; CHECK-NEWLOWERING-NEXT: udot z0.d, z1.h, z2.h
12011229
; CHECK-NEWLOWERING-NEXT: ret
12021230
entry:
@@ -1229,9 +1257,11 @@ define <vscale x 2 x i16> @sdot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale
12291257
; CHECK-NEWLOWERING-LABEL: sdot_nxv8i8_promote:
12301258
; CHECK-NEWLOWERING: // %bb.0: // %entry
12311259
; CHECK-NEWLOWERING-NEXT: ptrue p0.h
1232-
; CHECK-NEWLOWERING-NEXT: sxtb z2.h, p0/m, z2.h
12331260
; CHECK-NEWLOWERING-NEXT: sxtb z1.h, p0/m, z1.h
1234-
; CHECK-NEWLOWERING-NEXT: sdot z0.d, z1.h, z2.h
1261+
; CHECK-NEWLOWERING-NEXT: sxtb z2.h, p0/m, z2.h
1262+
; CHECK-NEWLOWERING-NEXT: mul z1.h, z1.h, z2.h
1263+
; CHECK-NEWLOWERING-NEXT: mov z2.h, #1 // =0x1
1264+
; CHECK-NEWLOWERING-NEXT: udot z0.d, z1.h, z2.h
12351265
; CHECK-NEWLOWERING-NEXT: ret
12361266
entry:
12371267
%a.wide = sext <vscale x 8 x i8> %a to <vscale x 8 x i16>

0 commit comments

Comments
 (0)