
Commit 5056050

Author: git apple-llvm automerger (committed)

Merge commit '04d10f1d13f6' from llvm.org/main into next

2 parents: 37a9fa1 + 04d10f1; commit: 5056050

File tree: 1 file changed, +271 -6 lines

llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll

Lines changed: 271 additions & 6 deletions
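All of the added tests exercise the same IR shape at different scalable-vector widths and extension kinds: widen two i8 vectors to i32 (sext, zext, or mixed), multiply them element-wise, and feed the product into llvm.experimental.vector.partial.reduce.add with a zero accumulator. A minimal sketch of that shape, lifted from the partial_reduce_m1 case in the diff below (the function name and comments here are illustrative only, not part of the commit):

; Illustrative sketch of the recurring pattern (not part of the commit).
define <vscale x 2 x i32> @example_partial_reduce(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
entry:
  ; widen both i8 operands to i32, then take the element-wise product
  %a.ext = sext <vscale x 8 x i8> %a to <vscale x 8 x i32>
  %b.ext = sext <vscale x 8 x i8> %b to <vscale x 8 x i32>
  %mul = mul nuw nsw <vscale x 8 x i32> %a.ext, %b.ext
  ; 4:1 partial reduction: fold the nxv8i32 product into an nxv2i32 accumulator (zero here)
  %res = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 2 x i32> zeroinitializer, <vscale x 8 x i32> %mul)
  ret <vscale x 2 x i32> %res
}

The CHECK lines in the diff record how the current SelectionDAG lowering expands this pattern (extends, widening multiplies, then vector adds and slides), with coverage spanning the nf2 through m16 cases plus unsigned and mixed-sign variants.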
@@ -523,8 +523,53 @@ entry:
 }


-define <vscale x 4 x i32> @vqdot_vv_partial_reduce(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: vqdot_vv_partial_reduce:
+define <vscale x 1 x i32> @partial_reduce_nf2(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) {
+; CHECK-LABEL: partial_reduce_nf2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsext.vf2 v10, v8
+; CHECK-NEXT: vsext.vf2 v11, v9
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vwmul.vv v8, v10, v11
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v10, v9, a0
+; CHECK-NEXT: vslidedown.vx v11, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: vadd.vv v9, v11, v9
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: ret
+entry:
+  %a.sext = sext <vscale x 4 x i8> %a to <vscale x 4 x i32>
+  %b.sext = sext <vscale x 4 x i8> %b to <vscale x 4 x i32>
+  %mul = mul nuw nsw <vscale x 4 x i32> %a.sext, %b.sext
+  %res = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 1 x i32> zeroinitializer, <vscale x 4 x i32> %mul)
+  ret <vscale x 1 x i32> %res
+}
+
+define <vscale x 2 x i32> @partial_reduce_m1(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+; CHECK-LABEL: partial_reduce_m1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsext.vf2 v12, v8
+; CHECK-NEXT: vsext.vf2 v14, v9
+; CHECK-NEXT: vwmul.vv v8, v12, v14
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v11, v8
+; CHECK-NEXT: vadd.vv v9, v9, v10
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: ret
+entry:
+  %a.sext = sext <vscale x 8 x i8> %a to <vscale x 8 x i32>
+  %b.sext = sext <vscale x 8 x i8> %b to <vscale x 8 x i32>
+  %mul = mul nuw nsw <vscale x 8 x i32> %a.sext, %b.sext
+  %res = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 2 x i32> zeroinitializer, <vscale x 8 x i32> %mul)
+  ret <vscale x 2 x i32> %res
+}
+
+define <vscale x 4 x i32> @partial_reduce_m2(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: partial_reduce_m2:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vsext.vf2 v16, v8
@@ -543,8 +588,178 @@ entry:
   ret <vscale x 4 x i32> %res
 }

-define <vscale x 4 x i32> @vqdot_vv_partial_reduce2(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 4 x i32> %accum) {
-; CHECK-LABEL: vqdot_vv_partial_reduce2:
+define <vscale x 8 x i32> @partial_reduce_m4(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-LABEL: partial_reduce_m4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsext.vf2 v24, v8
+; CHECK-NEXT: vsext.vf2 v16, v10
+; CHECK-NEXT: vsext.vf2 v28, v12
+; CHECK-NEXT: vsext.vf2 v20, v14
+; CHECK-NEXT: vwmul.vv v8, v16, v20
+; CHECK-NEXT: vwmul.vv v16, v24, v28
+; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT: vadd.vv v16, v20, v16
+; CHECK-NEXT: vadd.vv v8, v12, v8
+; CHECK-NEXT: vadd.vv v8, v8, v16
+; CHECK-NEXT: ret
+entry:
+  %a.sext = sext <vscale x 32 x i8> %a to <vscale x 32 x i32>
+  %b.sext = sext <vscale x 32 x i8> %b to <vscale x 32 x i32>
+  %mul = mul nuw nsw <vscale x 32 x i32> %a.sext, %b.sext
+  %res = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 8 x i32> zeroinitializer, <vscale x 32 x i32> %mul)
+  ret <vscale x 8 x i32> %res
+}
+
+define <vscale x 16 x i32> @partial_reduce_m8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b) {
+; CHECK-LABEL: partial_reduce_m8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsext.vf2 v24, v10
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v24, (a0) # vscale x 32-byte Folded Spill
+; CHECK-NEXT: vsext.vf2 v0, v8
+; CHECK-NEXT: vsext.vf2 v8, v18
+; CHECK-NEXT: vsext.vf2 v4, v16
+; CHECK-NEXT: vwmul.vv v24, v0, v4
+; CHECK-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; CHECK-NEXT: vwmacc.vv v24, v16, v8
+; CHECK-NEXT: vsext.vf2 v8, v12
+; CHECK-NEXT: vsext.vf2 v16, v20
+; CHECK-NEXT: vwmacc.vv v24, v8, v16
+; CHECK-NEXT: vsext.vf2 v8, v14
+; CHECK-NEXT: vsext.vf2 v12, v22
+; CHECK-NEXT: vwmacc.vv v24, v8, v12
+; CHECK-NEXT: vmv8r.v v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+entry:
+  %a.sext = sext <vscale x 64 x i8> %a to <vscale x 64 x i32>
+  %b.sext = sext <vscale x 64 x i8> %b to <vscale x 64 x i32>
+  %mul = mul nuw nsw <vscale x 64 x i32> %a.sext, %b.sext
+  %res = call <vscale x 16 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 16 x i32> zeroinitializer, <vscale x 64 x i32> %mul)
+  ret <vscale x 16 x i32> %res
+}
+
+define <vscale x 32 x i32> @partial_reduce_m16(<vscale x 128 x i8> %a, <vscale x 128 x i8> %b) {
+; CHECK-LABEL: partial_reduce_m16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vl8r.v v16, (a0)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsext.vf2 v4, v8
+; CHECK-NEXT: vsext.vf2 v0, v16
+; CHECK-NEXT: vwmul.vv v24, v4, v0
+; CHECK-NEXT: vsext.vf2 v4, v10
+; CHECK-NEXT: vsext.vf2 v8, v18
+; CHECK-NEXT: vwmacc.vv v24, v4, v8
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vsext.vf2 v0, v12
+; CHECK-NEXT: vl8r.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsext.vf2 v4, v20
+; CHECK-NEXT: vwmacc.vv v24, v0, v4
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsext.vf2 v20, v0
+; CHECK-NEXT: vsext.vf2 v16, v8
+; CHECK-NEXT: vwmul.vv v0, v20, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsext.vf2 v20, v18
+; CHECK-NEXT: vsext.vf2 v16, v10
+; CHECK-NEXT: vwmacc.vv v0, v20, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsext.vf2 v8, v20
+; CHECK-NEXT: vsext.vf2 v16, v12
+; CHECK-NEXT: vwmacc.vv v0, v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsext.vf2 v8, v22
+; CHECK-NEXT: vsext.vf2 v16, v14
+; CHECK-NEXT: vwmacc.vv v0, v8, v16
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsext.vf2 v8, v14
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsext.vf2 v12, v22
+; CHECK-NEXT: vwmacc.vv v24, v8, v12
+; CHECK-NEXT: vmv8r.v v8, v24
+; CHECK-NEXT: vmv8r.v v16, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+entry:
+  %a.sext = sext <vscale x 128 x i8> %a to <vscale x 128 x i32>
+  %b.sext = sext <vscale x 128 x i8> %b to <vscale x 128 x i32>
+  %mul = mul nuw nsw <vscale x 128 x i32> %a.sext, %b.sext
+  %res = call <vscale x 32 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 32 x i32> zeroinitializer, <vscale x 128 x i32> %mul)
+  ret <vscale x 32 x i32> %res
+}
+
+define <vscale x 4 x i32> @partial_reduce_accum(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 4 x i32> %accum) {
+; CHECK-LABEL: partial_reduce_accum:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vsext.vf2 v24, v8
@@ -564,8 +779,8 @@ entry:
   ret <vscale x 4 x i32> %res
 }

-define <vscale x 16 x i32> @vqdot_vv_partial_reduce3(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: vqdot_vv_partial_reduce3:
+define <vscale x 16 x i32> @partial_reduce_via_accum(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: partial_reduce_via_accum:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vsext.vf2 v16, v8
@@ -579,3 +794,53 @@ entry:
   %res = call <vscale x 16 x i32> @llvm.experimental.vector.partial.reduce.add.nvx16i32.nvx16i32(<vscale x 16 x i32> %mul, <vscale x 16 x i32> zeroinitializer)
   ret <vscale x 16 x i32> %res
 }
+
+define <vscale x 1 x i32> @partial_reduce_vqdotu(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) {
+; CHECK-LABEL: partial_reduce_vqdotu:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vwmulu.vv v10, v8, v9
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vzext.vf2 v8, v10
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v10, v9, a0
+; CHECK-NEXT: vslidedown.vx v11, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: vadd.vv v9, v11, v9
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: ret
+entry:
+  %a.sext = zext <vscale x 4 x i8> %a to <vscale x 4 x i32>
+  %b.sext = zext <vscale x 4 x i8> %b to <vscale x 4 x i32>
+  %mul = mul nuw nsw <vscale x 4 x i32> %a.sext, %b.sext
+  %res = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 1 x i32> zeroinitializer, <vscale x 4 x i32> %mul)
+  ret <vscale x 1 x i32> %res
+}
+
+define <vscale x 1 x i32> @partial_reduce_vqdotsu(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) {
+; CHECK-LABEL: partial_reduce_vqdotsu:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsext.vf2 v10, v8
+; CHECK-NEXT: vzext.vf2 v11, v9
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vwmulsu.vv v8, v10, v11
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v10, v9, a0
+; CHECK-NEXT: vslidedown.vx v11, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: vadd.vv v9, v11, v9
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: ret
+entry:
+  %a.sext = sext <vscale x 4 x i8> %a to <vscale x 4 x i32>
+  %b.sext = zext <vscale x 4 x i8> %b to <vscale x 4 x i32>
+  %mul = mul nuw nsw <vscale x 4 x i32> %a.sext, %b.sext
+  %res = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 1 x i32> zeroinitializer, <vscale x 4 x i32> %mul)
+  ret <vscale x 1 x i32> %res
+}
