@@ -530,3 +530,76 @@ entry:
530530 %sum = tail call i32 @llvm.vector.reduce.add.v16i32 (<16 x i32 > %add )
531531 ret i32 %sum
532532}
533+
534+
; Partial reduction of a 16-element signed i8*i8 dot product into a <4 x i32>
; result, starting from a zero accumulator.  Codegen (no zvqdotq): sign-extend
; both inputs, widening multiply, then fold the four 4-element quarters of the
; <16 x i32> product together with slide-down/add steps.
define <4 x i32> @vqdot_vv_partial_reduce(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: vqdot_vv_partial_reduce:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vsext.vf2 v12, v8
; CHECK-NEXT:    vsext.vf2 v14, v9
; CHECK-NEXT:    vwmul.vv v8, v12, v14
; CHECK-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
; CHECK-NEXT:    vslidedown.vi v12, v8, 12
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vadd.vv v16, v12, v8
; CHECK-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
; CHECK-NEXT:    vslidedown.vi v12, v8, 8
; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
; CHECK-NEXT:    vslidedown.vi v8, v8, 4
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vadd.vv v8, v8, v12
; CHECK-NEXT:    vadd.vv v8, v8, v16
; CHECK-NEXT:    ret
entry:
  %a.sext = sext <16 x i8> %a to <16 x i32>
  %b.sext = sext <16 x i8> %b to <16 x i32>
  %mul = mul nuw nsw <16 x i32> %a.sext, %b.sext
  %res = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> zeroinitializer, <16 x i32> %mul)
  ret <4 x i32> %res
}
561+
; Same partial dot-product reduction as above, but with a live-in <4 x i32>
; accumulator (%accum, arriving in v10) instead of zeroinitializer.  The
; accumulator is folded in with an extra vadd.vv against the first quarter.
define <4 x i32> @vqdot_vv_partial_reduce2(<16 x i8> %a, <16 x i8> %b, <4 x i32> %accum) {
; CHECK-LABEL: vqdot_vv_partial_reduce2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vsext.vf2 v16, v8
; CHECK-NEXT:    vsext.vf2 v18, v9
; CHECK-NEXT:    vwmul.vv v12, v16, v18
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vadd.vv v16, v10, v12
; CHECK-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
; CHECK-NEXT:    vslidedown.vi v8, v12, 12
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vadd.vv v16, v8, v16
; CHECK-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
; CHECK-NEXT:    vslidedown.vi v8, v12, 8
; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
; CHECK-NEXT:    vslidedown.vi v10, v12, 4
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vadd.vv v8, v10, v8
; CHECK-NEXT:    vadd.vv v8, v8, v16
; CHECK-NEXT:    ret
entry:
  %a.sext = sext <16 x i8> %a to <16 x i32>
  %b.sext = sext <16 x i8> %b to <16 x i32>
  %mul = mul nuw nsw <16 x i32> %a.sext, %b.sext
  %res = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %accum, <16 x i32> %mul)
  ret <4 x i32> %res
}
590+
; Degenerate partial reduction: result type equals input type (<16 x i32> ->
; <16 x i32>), so no horizontal folding is needed and codegen is just the
; sign-extended widening multiply.
; NOTE(review): the operands here are (%mul, zeroinitializer) — reversed
; relative to the (accumulator, input) order used by the two tests above —
; and the mangled suffix is hand-written; with both operands <16 x i32> the
; call is symmetric, so this matches the upstream test as committed.
define <16 x i32> @vqdot_vv_partial_reduce3(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: vqdot_vv_partial_reduce3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vsext.vf2 v12, v8
; CHECK-NEXT:    vsext.vf2 v14, v9
; CHECK-NEXT:    vwmul.vv v8, v12, v14
; CHECK-NEXT:    ret
entry:
  %a.sext = sext <16 x i8> %a to <16 x i32>
  %b.sext = sext <16 x i8> %b to <16 x i32>
  %mul = mul nuw nsw <16 x i32> %a.sext, %b.sext
  %res = call <16 x i32> @llvm.experimental.vector.partial.reduce.add.nvx8i32.nvx16i32.nvx16i32(<16 x i32> %mul, <16 x i32> zeroinitializer)
  ret <16 x i32> %res
}
0 commit comments