@@ -559,3 +559,124 @@ exit: ; preds = %for.body
559
559
%add.lcssa = phi i32 [ %add , %for.body ]
560
560
ret i32 %add.lcssa
561
561
}
562
+
563
+ ; Make sure that if there are several reductions in the loop, the order of the invariant stores sunk out of the loop is preserved.
564
+ ; FIXME: This test currently shows incorrect behavior; it will be fixed in a following patch.
565
+ ; See https://github.com/llvm/llvm-project/issues/64047
566
+ define void @reduc_add_mul_store_same_ptr (ptr %dst , ptr readonly %src ) { ; add and mul reductions over %src; both running values are stored to the same %dst on every iteration
567
+ ; CHECK-LABEL: define void @reduc_add_mul_store_same_ptr
568
+ ; CHECK: middle.block:
569
+ ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
570
+ ; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
571
+ ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
572
+ ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
573
+ ;
574
+ entry:
575
+ br label %for.body
576
+
577
+ for.body:
578
+ %sum = phi i32 [ 0 , %entry ], [ %sum.next , %for.body ] ; running sum, starts at 0
579
+ %mul = phi i32 [ 1 , %entry ], [ %mul.next , %for.body ] ; running product, starts at 1
580
+ %iv = phi i64 [ 0 , %entry ], [ %iv.next , %for.body ] ; induction variable, starts at 0 and steps by 1
581
+ %gep.src = getelementptr inbounds i32 , ptr %src , i64 %iv ; address of the iv-th i32 element of %src
582
+ %0 = load i32 , ptr %gep.src , align 4 ; element fed to both reductions
583
+ %sum.next = add nsw i32 %sum , %0
584
+ store i32 %sum.next , ptr %dst , align 4 ; first store of the iteration: the sum
585
+ %mul.next = mul nsw i32 %mul , %0
586
+ store i32 %mul.next , ptr %dst , align 4 ; second store to the same address: the product is written after the sum here, while the expected middle.block output above stores mul first and add last
587
+ %iv.next = add nuw nsw i64 %iv , 1
588
+ %exitcond = icmp eq i64 %iv.next , 1000 ; exit once iv.next reaches 1000
589
+ br i1 %exitcond , label %exit , label %for.body
590
+
591
+ exit:
592
+ ret void
593
+ }
594
+
595
+ define void @reduc_mul_add_store_same_ptr (ptr %dst , ptr readonly %src ) { ; mul and add reductions over %src; both running values are stored to the same %dst on every iteration, product first
596
+ ; CHECK-LABEL: define void @reduc_mul_add_store_same_ptr
597
+ ; CHECK: middle.block:
598
+ ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
599
+ ; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
600
+ ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
601
+ ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
602
+ ;
603
+ entry:
604
+ br label %for.body
605
+
606
+ for.body:
607
+ %sum = phi i32 [ 0 , %entry ], [ %sum.next , %for.body ] ; running sum, starts at 0
608
+ %mul = phi i32 [ 1 , %entry ], [ %mul.next , %for.body ] ; running product, starts at 1
609
+ %iv = phi i64 [ 0 , %entry ], [ %iv.next , %for.body ] ; induction variable, starts at 0 and steps by 1
610
+ %gep.src = getelementptr inbounds i32 , ptr %src , i64 %iv ; address of the iv-th i32 element of %src
611
+ %0 = load i32 , ptr %gep.src , align 4 ; element fed to both reductions
612
+ %mul.next = mul nsw i32 %mul , %0
613
+ store i32 %mul.next , ptr %dst , align 4 ; first store of the iteration: the product
614
+ %sum.next = add nsw i32 %sum , %0
615
+ store i32 %sum.next , ptr %dst , align 4 ; second store to the same address: the sum is written last, the same mul-then-add order as the expected middle.block output above
616
+ %iv.next = add nuw nsw i64 %iv , 1
617
+ %exitcond = icmp eq i64 %iv.next , 1000 ; exit once iv.next reaches 1000
618
+ br i1 %exitcond , label %exit , label %for.body
619
+
620
+ exit:
621
+ ret void
622
+ }
623
+
624
+ ; Same as above, but the stores go to two different pointers that may alias.
625
+ ; FIXME: This test currently shows incorrect behavior; it will be fixed in a following patch.
626
+ define void @reduc_add_mul_store_different_ptr (ptr %dst1 , ptr %dst2 , ptr readonly %src ) { ; add reduction stored to %dst1, then mul reduction stored to %dst2; neither pointer is marked noalias
627
+ ; CHECK-LABEL: define void @reduc_add_mul_store_different_ptr
628
+ ; CHECK: middle.block:
629
+ ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
630
+ ; CHECK-NEXT: store i32 [[TMP2]], ptr %dst2, align 4
631
+ ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
632
+ ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst1, align 4
633
+ ;
634
+ entry:
635
+ br label %for.body
636
+
637
+ for.body:
638
+ %sum = phi i32 [ 0 , %entry ], [ %sum.next , %for.body ] ; running sum, starts at 0
639
+ %mul = phi i32 [ 1 , %entry ], [ %mul.next , %for.body ] ; running product, starts at 1
640
+ %iv = phi i64 [ 0 , %entry ], [ %iv.next , %for.body ] ; induction variable, starts at 0 and steps by 1
641
+ %gep.src = getelementptr inbounds i32 , ptr %src , i64 %iv ; address of the iv-th i32 element of %src
642
+ %0 = load i32 , ptr %gep.src , align 4 ; element fed to both reductions
643
+ %sum.next = add nsw i32 %sum , %0
644
+ store i32 %sum.next , ptr %dst1 , align 4 ; first store of the iteration: the sum, to %dst1
645
+ %mul.next = mul nsw i32 %mul , %0
646
+ store i32 %mul.next , ptr %dst2 , align 4 ; second store of the iteration: the product, to %dst2; the expected middle.block output above stores to %dst2 before %dst1
647
+ %iv.next = add nuw nsw i64 %iv , 1
648
+ %exitcond = icmp eq i64 %iv.next , 1000 ; exit once iv.next reaches 1000
649
+ br i1 %exitcond , label %exit , label %for.body
650
+
651
+ exit:
652
+ ret void
653
+ }
654
+
655
+ define void @reduc_mul_add_store_different_ptr (ptr %dst1 , ptr %dst2 , ptr readonly %src ) { ; mul reduction stored to %dst1, then add reduction stored to %dst2; neither pointer is marked noalias
656
+ ; CHECK-LABEL: define void @reduc_mul_add_store_different_ptr
657
+ ; CHECK: middle.block:
658
+ ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
659
+ ; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
660
+ ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
661
+ ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4
662
+ ;
663
+ entry:
664
+ br label %for.body
665
+
666
+ for.body:
667
+ %sum = phi i32 [ 0 , %entry ], [ %sum.next , %for.body ] ; running sum, starts at 0
668
+ %mul = phi i32 [ 1 , %entry ], [ %mul.next , %for.body ] ; running product, starts at 1
669
+ %iv = phi i64 [ 0 , %entry ], [ %iv.next , %for.body ] ; induction variable, starts at 0 and steps by 1
670
+ %gep.src = getelementptr inbounds i32 , ptr %src , i64 %iv ; address of the iv-th i32 element of %src
671
+ %0 = load i32 , ptr %gep.src , align 4 ; element fed to both reductions
672
+ %mul.next = mul nsw i32 %mul , %0
673
+ store i32 %mul.next , ptr %dst1 , align 4 ; first store of the iteration: the product, to %dst1
674
+ %sum.next = add nsw i32 %sum , %0
675
+ store i32 %sum.next , ptr %dst2 , align 4 ; second store of the iteration: the sum, to %dst2; same %dst1-then-%dst2 order as the expected middle.block output above
676
+ %iv.next = add nuw nsw i64 %iv , 1
677
+ %exitcond = icmp eq i64 %iv.next , 1000 ; exit once iv.next reaches 1000
678
+ br i1 %exitcond , label %exit , label %for.body
679
+
680
+ exit:
681
+ ret void
682
+ }
0 commit comments