@@ -284,6 +284,371 @@ define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) {
284284 ret i32 %r
285285}
286286
; Two fast fadd reductions, each added to a scalar (%c / %d), then summed.
; With fast-math the whole tree can be reassociated into paired faddp's.
define float @nested_fadd_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_fadd_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    faddp v1.4s, v1.4s, v1.4s
; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-NEXT:    faddp s1, v1.2s
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    fadd s1, s1, s3
; CHECK-NEXT:    fadd s0, s0, s2
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %a1 = fadd fast float %r1, %c
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %a2 = fadd fast float %r2, %d
  %r = fadd fast float %a1, %a2
  ret float %r
}

; Same shape as nested_fadd_f32 but WITHOUT fast-math flags: strict FP
; ordering must be preserved, so the reduction is expanded element by element.
define float @nested_fadd_f32_slow(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_fadd_f32_slow:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov s4, v1.s[2]
; CHECK-NEXT:    mov s5, v0.s[2]
; CHECK-NEXT:    faddp s6, v0.2s
; CHECK-NEXT:    faddp s7, v1.2s
; CHECK-NEXT:    mov s1, v1.s[3]
; CHECK-NEXT:    mov s0, v0.s[3]
; CHECK-NEXT:    fadd s5, s6, s5
; CHECK-NEXT:    fadd s4, s7, s4
; CHECK-NEXT:    fadd s0, s5, s0
; CHECK-NEXT:    fadd s1, s4, s1
; CHECK-NEXT:    fadd s0, s0, s2
; CHECK-NEXT:    fadd s1, s1, s3
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %a1 = fadd float %r1, %c
  %r2 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %a2 = fadd float %r2, %d
  %r = fadd float %a1, %a2
  ret float %r
}

; Two fast fmul reductions combined with scalars; no fmulv exists on AArch64,
; so each reduction is a shuffle+fmul log2 tree.
define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_mul_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NEXT:    ext v5.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    fmul v1.2s, v1.2s, v4.2s
; CHECK-NEXT:    fmul v0.2s, v0.2s, v5.2s
; CHECK-NEXT:    fmul s1, s1, v1.s[1]
; CHECK-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-NEXT:    fmul s1, s1, s3
; CHECK-NEXT:    fmul s0, s0, s2
; CHECK-NEXT:    fmul s0, s0, s1
; CHECK-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %a1 = fmul fast float %r1, %c
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %a2 = fmul fast float %r2, %d
  %r = fmul fast float %a1, %a2
  ret float %r
}

; Two integer add reductions (addv), each added to a scalar, then summed.
define i32 @nested_add_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_add_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addv s1, v1.4s
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w8, s1
; CHECK-NEXT:    fmov w9, s0
; CHECK-NEXT:    add w9, w9, w0
; CHECK-NEXT:    add w8, w8, w1
; CHECK-NEXT:    add w0, w9, w8
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %r2, %d
  %r = add i32 %a1, %a2
  ret i32 %r
}

; As nested_add_i32 but with the first add's operands commuted (%c, %r1) to
; check the combine handles either operand order.
define i32 @nested_add_c1_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_add_c1_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addv s1, v1.4s
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w8, s1
; CHECK-NEXT:    fmov w9, s0
; CHECK-NEXT:    add w9, w0, w9
; CHECK-NEXT:    add w8, w8, w1
; CHECK-NEXT:    add w0, w9, w8
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %c, %r1
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %r2, %d
  %r = add i32 %a1, %a2
  ret i32 %r
}

; As nested_add_i32 but with the second add's operands commuted (%d, %r2).
define i32 @nested_add_c2_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_add_c2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addv s1, v1.4s
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w8, s1
; CHECK-NEXT:    fmov w9, s0
; CHECK-NEXT:    add w9, w9, w0
; CHECK-NEXT:    add w8, w1, w8
; CHECK-NEXT:    add w0, w9, w8
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %d, %r2
  %r = add i32 %a1, %a2
  ret i32 %r
}

; Four add reductions combined pairwise: expect them to be merged into vector
; adds with a single final addv instead of four separate reductions.
define i32 @nested_add_manyreduct_i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; CHECK-LABEL: nested_add_manyreduct_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %r3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %c)
  %a1 = add i32 %r1, %r3
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %r4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %d)
  %a2 = add i32 %r2, %r4
  %r = add i32 %a1, %a2
  ret i32 %r
}

; Two integer mul reductions (no mulv instruction, so shuffle+mul trees),
; each multiplied by a scalar, then multiplied together.
define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_mul_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
; CHECK-NEXT:    mul v0.2s, v0.2s, v3.2s
; CHECK-NEXT:    mul v1.2s, v1.2s, v2.2s
; CHECK-NEXT:    mov w8, v0.s[1]
; CHECK-NEXT:    fmov w10, s0
; CHECK-NEXT:    mov w9, v1.s[1]
; CHECK-NEXT:    mul w8, w10, w8
; CHECK-NEXT:    fmov w10, s1
; CHECK-NEXT:    mul w9, w10, w9
; CHECK-NEXT:    mul w8, w8, w0
; CHECK-NEXT:    mul w9, w9, w1
; CHECK-NEXT:    mul w0, w8, w9
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
  %a1 = mul i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %b)
  %a2 = mul i32 %r2, %d
  %r = mul i32 %a1, %a2
  ret i32 %r
}

; Two and-reductions (expanded as ext/and halving steps), each and'ed with a
; scalar, then combined.
define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_and_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
; CHECK-NEXT:    and v0.8b, v0.8b, v3.8b
; CHECK-NEXT:    fmov x8, d1
; CHECK-NEXT:    fmov x9, d0
; CHECK-NEXT:    lsr x10, x9, #32
; CHECK-NEXT:    lsr x11, x8, #32
; CHECK-NEXT:    and w9, w9, w0
; CHECK-NEXT:    and w8, w8, w1
; CHECK-NEXT:    and w9, w9, w10
; CHECK-NEXT:    and w8, w8, w11
; CHECK-NEXT:    and w0, w9, w8
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
  %a1 = and i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %b)
  %a2 = and i32 %r2, %d
  %r = and i32 %a1, %a2
  ret i32 %r
}

; Two or-reductions, each or'ed with a scalar, then combined.
define i32 @nested_or_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_or_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    orr v1.8b, v1.8b, v2.8b
; CHECK-NEXT:    orr v0.8b, v0.8b, v3.8b
; CHECK-NEXT:    fmov x8, d1
; CHECK-NEXT:    fmov x9, d0
; CHECK-NEXT:    lsr x10, x9, #32
; CHECK-NEXT:    lsr x11, x8, #32
; CHECK-NEXT:    orr w9, w9, w0
; CHECK-NEXT:    orr w8, w8, w1
; CHECK-NEXT:    orr w9, w9, w10
; CHECK-NEXT:    orr w8, w8, w11
; CHECK-NEXT:    orr w0, w9, w8
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
  %a1 = or i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %b)
  %a2 = or i32 %r2, %d
  %r = or i32 %a1, %a2
  ret i32 %r
}

; Two xor-reductions, each xor'ed with a scalar, then combined.
define i32 @nested_xor_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_xor_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    eor v1.8b, v1.8b, v2.8b
; CHECK-NEXT:    eor v0.8b, v0.8b, v3.8b
; CHECK-NEXT:    fmov x8, d1
; CHECK-NEXT:    fmov x9, d0
; CHECK-NEXT:    lsr x10, x9, #32
; CHECK-NEXT:    lsr x11, x8, #32
; CHECK-NEXT:    eor w9, w9, w0
; CHECK-NEXT:    eor w8, w8, w1
; CHECK-NEXT:    eor w9, w9, w10
; CHECK-NEXT:    eor w8, w8, w11
; CHECK-NEXT:    eor w0, w9, w8
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
  %a1 = xor i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %b)
  %a2 = xor i32 %r2, %d
  %r = xor i32 %a1, %a2
  ret i32 %r
}

; Two sminv reductions, each min'ed with a scalar via llvm.smin, then combined
; (scalar min lowers to cmp+csel lt).
define i32 @nested_smin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_smin_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sminv s0, v0.4s
; CHECK-NEXT:    sminv s1, v1.4s
; CHECK-NEXT:    fmov w9, s0
; CHECK-NEXT:    fmov w8, s1
; CHECK-NEXT:    cmp w9, w0
; CHECK-NEXT:    csel w9, w9, w0, lt
; CHECK-NEXT:    cmp w8, w1
; CHECK-NEXT:    csel w8, w8, w1, lt
; CHECK-NEXT:    cmp w9, w8
; CHECK-NEXT:    csel w0, w9, w8, lt
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.smin.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.smin.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.smin.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

; Two smaxv reductions combined through llvm.smax (cmp+csel gt).
define i32 @nested_smax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_smax_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smaxv s0, v0.4s
; CHECK-NEXT:    smaxv s1, v1.4s
; CHECK-NEXT:    fmov w9, s0
; CHECK-NEXT:    fmov w8, s1
; CHECK-NEXT:    cmp w9, w0
; CHECK-NEXT:    csel w9, w9, w0, gt
; CHECK-NEXT:    cmp w8, w1
; CHECK-NEXT:    csel w8, w8, w1, gt
; CHECK-NEXT:    cmp w9, w8
; CHECK-NEXT:    csel w0, w9, w8, gt
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.smax.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.smax.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.smax.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

; Two uminv reductions combined through llvm.umin (cmp+csel lo, unsigned).
define i32 @nested_umin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_umin_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uminv s0, v0.4s
; CHECK-NEXT:    uminv s1, v1.4s
; CHECK-NEXT:    fmov w9, s0
; CHECK-NEXT:    fmov w8, s1
; CHECK-NEXT:    cmp w9, w0
; CHECK-NEXT:    csel w9, w9, w0, lo
; CHECK-NEXT:    cmp w8, w1
; CHECK-NEXT:    csel w8, w8, w1, lo
; CHECK-NEXT:    cmp w9, w8
; CHECK-NEXT:    csel w0, w9, w8, lo
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.umin.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.umin.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.umin.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

; Two umaxv reductions combined through llvm.umax (cmp+csel hi, unsigned).
define i32 @nested_umax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_umax_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umaxv s0, v0.4s
; CHECK-NEXT:    umaxv s1, v1.4s
; CHECK-NEXT:    fmov w9, s0
; CHECK-NEXT:    fmov w8, s1
; CHECK-NEXT:    cmp w9, w0
; CHECK-NEXT:    csel w9, w9, w0, hi
; CHECK-NEXT:    cmp w8, w1
; CHECK-NEXT:    csel w8, w8, w1, hi
; CHECK-NEXT:    cmp w9, w8
; CHECK-NEXT:    csel w0, w9, w8, hi
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.umax.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.umax.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.umax.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

; Two fmin reductions (fminnmv) combined with scalars via llvm.minnum (fminnm).
define float @nested_fmin_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_fmin_float:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fminnmv s1, v1.4s
; CHECK-NEXT:    fminnmv s0, v0.4s
; CHECK-NEXT:    fminnm s1, s1, s3
; CHECK-NEXT:    fminnm s0, s0, s2
; CHECK-NEXT:    fminnm s0, s0, s1
; CHECK-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
  %a1 = call float @llvm.minnum.f32(float %r1, float %c)
  %r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
  %a2 = call float @llvm.minnum.f32(float %r2, float %d)
  %r = call float @llvm.minnum.f32(float %a1, float %a2)
  ret float %r
}

; Two fmax reductions (fmaxnmv) combined with scalars via llvm.maxnum (fmaxnm).
define float @nested_fmax_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_fmax_float:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmaxnmv s1, v1.4s
; CHECK-NEXT:    fmaxnmv s0, v0.4s
; CHECK-NEXT:    fmaxnm s1, s1, s3
; CHECK-NEXT:    fmaxnm s0, s0, s2
; CHECK-NEXT:    fmaxnm s0, s0, s1
; CHECK-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  %a1 = call float @llvm.maxnum.f32(float %r1, float %c)
  %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %a2 = call float @llvm.maxnum.f32(float %r2, float %d)
  %r = call float @llvm.maxnum.f32(float %a1, float %a2)
  ret float %r
}

; Declarations for the reduction intrinsics used above.
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)