@@ -297,3 +297,186 @@ entry:
   %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a.ext)
   ret i32 %res
 }
+
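+; The *_accum tests below add a vector accumulator %x to the widened product
+; before the reduction. The shared CHECK prefix suggests both RUN
+; configurations currently lower these the same way, i.e. the vqdot forms are
+; not used when the accumulator is a full vector.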
+define i32 @vqdot_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) {
+; CHECK-LABEL: vqdot_vv_accum:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v10, v8
+; CHECK-NEXT:    vsext.vf2 v16, v9
+; CHECK-NEXT:    vwmacc.vv v12, v10, v16
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vredsum.vs v8, v12, v8
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.sext = sext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.sext
+  %add = add <16 x i32> %mul, %x
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %add)
+  ret i32 %sum
+}
+
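+; Unsigned variant: zext operands, lowered with vwmulu.vv plus a widening
+; accumulate (vwaddu.wv) rather than vwmacc.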
+define i32 @vqdotu_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) {
+; CHECK-LABEL: vqdotu_vv_accum:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vwmulu.vv v10, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vwaddu.wv v12, v12, v10
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vredsum.vs v8, v12, v8
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+entry:
+  %a.zext = zext <16 x i8> %a to <16 x i32>
+  %b.zext = zext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.zext, %b.zext
+  %add = add <16 x i32> %mul, %x
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %add)
+  ret i32 %sum
+}
+
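+; Mixed-sign variant: sext %a, zext %b, accumulated via vwmaccsu.vv.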
+define i32 @vqdotsu_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) {
+; CHECK-LABEL: vqdotsu_vv_accum:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v10, v8
+; CHECK-NEXT:    vzext.vf2 v16, v9
+; CHECK-NEXT:    vwmaccsu.vv v12, v10, v16
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vredsum.vs v8, v12, v8
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.zext = zext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.zext
+  %add = add <16 x i32> %mul, %x
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %add)
+  ret i32 %sum
+}
+
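+; The *_scalar_add tests add a scalar %x to the reduced result instead. Here
+; the two RUN configurations diverge: with the dot-product instructions
+; available (DOT), the products accumulate via vqdot*.vv at e32/m1 and %x is
+; folded in as the vredsum start value.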
+define i32 @vqdot_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) {
+; NODOT-LABEL: vqdot_vv_scalar_add:
+; NODOT:       # %bb.0: # %entry
+; NODOT-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; NODOT-NEXT:    vsext.vf2 v12, v8
+; NODOT-NEXT:    vsext.vf2 v14, v9
+; NODOT-NEXT:    vwmul.vv v8, v12, v14
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.s.x v12, a0
+; NODOT-NEXT:    vredsum.vs v8, v8, v12
+; NODOT-NEXT:    vmv.x.s a0, v8
+; NODOT-NEXT:    ret
+;
+; DOT-LABEL: vqdot_vv_scalar_add:
+; DOT:       # %bb.0: # %entry
+; DOT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT-NEXT:    vmv.v.i v10, 0
+; DOT-NEXT:    vqdot.vv v10, v8, v9
+; DOT-NEXT:    vmv.s.x v8, a0
+; DOT-NEXT:    vredsum.vs v8, v10, v8
+; DOT-NEXT:    vmv.x.s a0, v8
+; DOT-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.sext = sext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.sext
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
+  %add = add i32 %sum, %x
+  ret i32 %add
+}
+
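+; Unsigned scalar-add variant: NODOT uses a widening reduction
+; (vwredsumu.vs); DOT uses vqdotu.vv.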
+define i32 @vqdotu_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) {
+; NODOT-LABEL: vqdotu_vv_scalar_add:
+; NODOT:       # %bb.0: # %entry
+; NODOT-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; NODOT-NEXT:    vwmulu.vv v10, v8, v9
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.s.x v8, a0
+; NODOT-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; NODOT-NEXT:    vwredsumu.vs v8, v10, v8
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.x.s a0, v8
+; NODOT-NEXT:    ret
+;
+; DOT-LABEL: vqdotu_vv_scalar_add:
+; DOT:       # %bb.0: # %entry
+; DOT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT-NEXT:    vmv.v.i v10, 0
+; DOT-NEXT:    vqdotu.vv v10, v8, v9
+; DOT-NEXT:    vmv.s.x v8, a0
+; DOT-NEXT:    vredsum.vs v8, v10, v8
+; DOT-NEXT:    vmv.x.s a0, v8
+; DOT-NEXT:    ret
+entry:
+  %a.zext = zext <16 x i8> %a to <16 x i32>
+  %b.zext = zext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.zext, %b.zext
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
+  %add = add i32 %sum, %x
+  ret i32 %add
+}
+
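+; Mixed-sign scalar-add variant: NODOT widens via vwmulsu.vv; DOT uses
+; vqdotsu.vv.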
+define i32 @vqdotsu_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) {
+; NODOT-LABEL: vqdotsu_vv_scalar_add:
+; NODOT:       # %bb.0: # %entry
+; NODOT-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; NODOT-NEXT:    vsext.vf2 v12, v8
+; NODOT-NEXT:    vzext.vf2 v14, v9
+; NODOT-NEXT:    vwmulsu.vv v8, v12, v14
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.s.x v12, a0
+; NODOT-NEXT:    vredsum.vs v8, v8, v12
+; NODOT-NEXT:    vmv.x.s a0, v8
+; NODOT-NEXT:    ret
+;
+; DOT-LABEL: vqdotsu_vv_scalar_add:
+; DOT:       # %bb.0: # %entry
+; DOT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT-NEXT:    vmv.v.i v10, 0
+; DOT-NEXT:    vqdotsu.vv v10, v8, v9
+; DOT-NEXT:    vmv.s.x v8, a0
+; DOT-NEXT:    vredsum.vs v8, v10, v8
+; DOT-NEXT:    vmv.x.s a0, v8
+; DOT-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.zext = zext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.zext
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
+  %add = add i32 %sum, %x
+  ret i32 %add
+}
+
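+; Two independent sign-extended products summed before a single reduction,
+; lowered as vwmul.vv followed by vwmacc.vv.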
+define i32 @vqdot_vv_split(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: vqdot_vv_split:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v12, v8
+; CHECK-NEXT:    vsext.vf2 v14, v9
+; CHECK-NEXT:    vsext.vf2 v16, v10
+; CHECK-NEXT:    vsext.vf2 v18, v11
+; CHECK-NEXT:    vwmul.vv v8, v12, v14
+; CHECK-NEXT:    vwmacc.vv v8, v16, v18
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv.s.x v12, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v12
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.sext = sext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.sext
+  %c.sext = sext <16 x i8> %c to <16 x i32>
+  %d.sext = sext <16 x i8> %d to <16 x i32>
+  %mul2 = mul nuw nsw <16 x i32> %c.sext, %d.sext
+  %add = add <16 x i32> %mul, %mul2
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %add)
+  ret i32 %sum
+}