@@ -434,6 +434,127 @@ define <4 x i32> @four_way_i8_i32_vl128_usdot(ptr %accptr, ptr %uptr, ptr %sptr)
   ret <4 x i32> %partial.reduce
 }
 
+define <4 x i32> @four_way_i8_i32_vl128_sudot(ptr %accptr, ptr %uptr, ptr %sptr) {
+; COMMON-LABEL: four_way_i8_i32_vl128_sudot:
+; COMMON: // %bb.0:
+; COMMON-NEXT: ldr q0, [x0]
+; COMMON-NEXT: ldr q1, [x1]
+; COMMON-NEXT: ldr q2, [x2]
+; COMMON-NEXT: usdot v0.4s, v2.16b, v1.16b
+; COMMON-NEXT: ret
+;
+; SME-LABEL: four_way_i8_i32_vl128_sudot:
+; SME: // %bb.0:
+; SME-NEXT: ldr q0, [x0]
+; SME-NEXT: ldr q1, [x1]
+; SME-NEXT: ldr q2, [x2]
+; SME-NEXT: usdot z0.s, z2.b, z1.b
+; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SME-NEXT: ret
+  %acc = load <4 x i32>, ptr %accptr
+  %u = load <16 x i8>, ptr %uptr
+  %s = load <16 x i8>, ptr %sptr
+  %u.wide = sext <16 x i8> %u to <16 x i32>
+  %s.wide = zext <16 x i8> %s to <16 x i32>
+  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
+  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %mult)
+  ret <4 x i32> %partial.reduce
+}
+
+define <2 x i64> @four_way_i8_i64_vl128_usdot(ptr %accptr, ptr %uptr, ptr %sptr) {
+; NEON-LABEL: four_way_i8_i64_vl128_usdot:
+; NEON: // %bb.0:
+; NEON-NEXT: movi v0.2d, #0000000000000000
+; NEON-NEXT: ldr q1, [x1]
+; NEON-NEXT: ldr q2, [x2]
+; NEON-NEXT: usdot v0.4s, v1.16b, v2.16b
+; NEON-NEXT: ldr q1, [x0]
+; NEON-NEXT: saddw v1.2d, v1.2d, v0.2s
+; NEON-NEXT: saddw2 v0.2d, v1.2d, v0.4s
+; NEON-NEXT: ret
+;
+; SVE-LABEL: four_way_i8_i64_vl128_usdot:
+; SVE: // %bb.0:
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: ldr q1, [x1]
+; SVE-NEXT: ldr q2, [x2]
+; SVE-NEXT: usdot z0.s, z1.b, z2.b
+; SVE-NEXT: ldr q2, [x0]
+; SVE-NEXT: sunpklo z1.d, z0.s
+; SVE-NEXT: sunpkhi z0.d, z0.s
+; SVE-NEXT: add z1.d, z2.d, z1.d
+; SVE-NEXT: add z0.d, z1.d, z0.d
+; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SVE-NEXT: ret
+;
+; SME-LABEL: four_way_i8_i64_vl128_usdot:
+; SME: // %bb.0:
+; SME-NEXT: mov z0.s, #0 // =0x0
+; SME-NEXT: ldr q1, [x1]
+; SME-NEXT: ldr q2, [x2]
+; SME-NEXT: usdot z0.s, z1.b, z2.b
+; SME-NEXT: ldr q1, [x0]
+; SME-NEXT: saddwb z1.d, z1.d, z0.s
+; SME-NEXT: saddwt z0.d, z1.d, z0.s
+; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SME-NEXT: ret
+  %acc = load <2 x i64>, ptr %accptr
+  %u = load <16 x i8>, ptr %uptr
+  %s = load <16 x i8>, ptr %sptr
+  %u.wide = zext <16 x i8> %u to <16 x i64>
+  %s.wide = sext <16 x i8> %s to <16 x i64>
+  %mult = mul nuw nsw <16 x i64> %s.wide, %u.wide
+  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <16 x i64> %mult)
+  ret <2 x i64> %partial.reduce
+}
+
+define <2 x i64> @four_way_i16_i64_vl128_usdot(ptr %accptr, ptr %uptr, ptr %sptr) {
+; COMMON-LABEL: four_way_i16_i64_vl128_usdot:
+; COMMON: // %bb.0:
+; COMMON-NEXT: ldr q1, [x1]
+; COMMON-NEXT: ldr q2, [x2]
+; COMMON-NEXT: ldr q0, [x0]
+; COMMON-NEXT: ushll v3.4s, v1.4h, #0
+; COMMON-NEXT: sshll v4.4s, v2.4h, #0
+; COMMON-NEXT: ushll2 v1.4s, v1.8h, #0
+; COMMON-NEXT: sshll2 v2.4s, v2.8h, #0
+; COMMON-NEXT: smlal v0.2d, v4.2s, v3.2s
+; COMMON-NEXT: smlal2 v0.2d, v4.4s, v3.4s
+; COMMON-NEXT: smlal v0.2d, v2.2s, v1.2s
+; COMMON-NEXT: smlal2 v0.2d, v2.4s, v1.4s
+; COMMON-NEXT: ret
+;
+; SME-LABEL: four_way_i16_i64_vl128_usdot:
+; SME: // %bb.0:
+; SME-NEXT: ptrue p0.d, vl2
+; SME-NEXT: ldr q2, [x0]
+; SME-NEXT: mov x8, #2 // =0x2
+; SME-NEXT: ld1h { z0.d }, p0/z, [x1]
+; SME-NEXT: ld1sh { z1.d }, p0/z, [x2]
+; SME-NEXT: mad z0.d, p0/m, z1.d, z2.d
+; SME-NEXT: ld1h { z1.d }, p0/z, [x1, x8, lsl #1]
+; SME-NEXT: ld1sh { z2.d }, p0/z, [x2, x8, lsl #1]
+; SME-NEXT: mov x8, #4 // =0x4
+; SME-NEXT: mla z0.d, p0/m, z2.d, z1.d
+; SME-NEXT: ld1h { z1.d }, p0/z, [x1, x8, lsl #1]
+; SME-NEXT: ld1sh { z2.d }, p0/z, [x2, x8, lsl #1]
+; SME-NEXT: mov x8, #6 // =0x6
+; SME-NEXT: mla z0.d, p0/m, z2.d, z1.d
+; SME-NEXT: ld1h { z1.d }, p0/z, [x1, x8, lsl #1]
+; SME-NEXT: ld1sh { z2.d }, p0/z, [x2, x8, lsl #1]
+; SME-NEXT: mla z0.d, p0/m, z2.d, z1.d
+; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SME-NEXT: ret
+  %acc = load <2 x i64>, ptr %accptr
+  %u = load <8 x i16>, ptr %uptr
+  %s = load <8 x i16>, ptr %sptr
+  %u.wide = zext <8 x i16> %u to <8 x i64>
+  %s.wide = sext <8 x i16> %s to <8 x i64>
+  %mult = mul nuw nsw <8 x i64> %s.wide, %u.wide
+  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %mult)
+  ret <2 x i64> %partial.reduce
+}
+
 define <8 x i32> @four_way_i8_i32_vl128_double_width(ptr %accptr, ptr %uptr, ptr %sptr) {
 ;
 ; COMMON-LABEL: four_way_i8_i32_vl128_double_width: