@@ -580,6 +580,201 @@ exit:
580
580
ret double %accum
581
581
}
582
582
583
+ ; Loop in which the address of the second load (%l.2) is a phi that blends a
+ ; computed pointer (%src.2 + (%iv + 1) * %start, taken when the float loaded
+ ; from %src compares equal to 0.0) with the loop-invariant pointer %src.2.
+ ; The loaded value is stored to %dst at byte offset %iv. The loop continues
+ ; while %iv.2, which starts at %start and decreases by 1 per iteration, is
+ ; signed-greater than 100.
+ ; The assertions under the I64 prefix expect the loop to stay scalar; those
+ ; under the I32 prefix expect an <8 x ...> vector body in which the blended
+ ; address becomes a select and every lane is loaded with a scalar load.
+ define void @loaded_address_used_by_load_through_blend(i64 %start, ptr noalias %src, ptr noalias %src.2, ptr noalias %dst) #0 {
584
+ ; I64-LABEL: define void @loaded_address_used_by_load_through_blend(
585
+ ; I64-SAME: i64 [[START:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0]] {
586
+ ; I64-NEXT: [[ENTRY:.*]]:
587
+ ; I64-NEXT: br label %[[LOOP_HEADER:.*]]
588
+ ; I64: [[LOOP_HEADER]]:
589
+ ; I64-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
590
+ ; I64-NEXT: [[IV_2:%.*]] = phi i64 [ [[START]], %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP_LATCH]] ]
591
+ ; I64-NEXT: [[IV_1:%.*]] = add i64 [[IV]], 1
592
+ ; I64-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_1]]
593
+ ; I64-NEXT: [[L_SRC:%.*]] = load float, ptr [[GEP_SRC]], align 4
594
+ ; I64-NEXT: [[C:%.*]] = fcmp oeq float [[L_SRC]], 0.000000e+00
595
+ ; I64-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
596
+ ; I64: [[THEN]]:
597
+ ; I64-NEXT: [[IV_MUL:%.*]] = mul i64 [[IV_1]], [[START]]
598
+ ; I64-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[IV_MUL]]
599
+ ; I64-NEXT: br label %[[LOOP_LATCH]]
600
+ ; I64: [[LOOP_LATCH]]:
601
+ ; I64-NEXT: [[MERGE_GEP:%.*]] = phi ptr [ [[GEP_SRC_2]], %[[THEN]] ], [ [[SRC_2]], %[[LOOP_HEADER]] ]
602
+ ; I64-NEXT: [[L_2:%.*]] = load float, ptr [[MERGE_GEP]], align 4
603
+ ; I64-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
604
+ ; I64-NEXT: store float [[L_2]], ptr [[GEP_DST]], align 4
605
+ ; I64-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
606
+ ; I64-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], -1
607
+ ; I64-NEXT: [[EC:%.*]] = icmp sgt i64 [[IV_2]], 100
608
+ ; I64-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT:.*]]
609
+ ; I64: [[EXIT]]:
610
+ ; I64-NEXT: ret void
611
+ ;
612
+ ; I32-LABEL: define void @loaded_address_used_by_load_through_blend(
613
+ ; I32-SAME: i64 [[START:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0]] {
614
+ ; I32-NEXT: [[ENTRY:.*:]]
615
+ ; I32-NEXT: [[TMP0:%.*]] = add i64 [[START]], 1
616
+ ; I32-NEXT: [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 [[START]], i64 100)
617
+ ; I32-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SMIN]]
618
+ ; I32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 8
619
+ ; I32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
620
+ ; I32: [[VECTOR_PH]]:
621
+ ; I32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 8
622
+ ; I32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
623
+ ; I32-NEXT: [[TMP2:%.*]] = sub i64 [[START]], [[N_VEC]]
624
+ ; I32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[START]], i64 0
625
+ ; I32-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
626
+ ; I32-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x ptr> poison, ptr [[SRC_2]], i64 0
627
+ ; I32-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT1]], <8 x ptr> poison, <8 x i32> zeroinitializer
628
+ ; I32-NEXT: br label %[[VECTOR_BODY:.*]]
629
+ ; I32: [[VECTOR_BODY]]:
630
+ ; I32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
631
+ ; I32-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
632
+ ; I32-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1
633
+ ; I32-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 2
634
+ ; I32-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 3
635
+ ; I32-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 4
636
+ ; I32-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 5
637
+ ; I32-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 6
638
+ ; I32-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 7
639
+ ; I32-NEXT: [[TMP11:%.*]] = add i64 [[TMP3]], 1
640
+ ; I32-NEXT: [[TMP12:%.*]] = add i64 [[TMP4]], 1
641
+ ; I32-NEXT: [[TMP13:%.*]] = add i64 [[TMP5]], 1
642
+ ; I32-NEXT: [[TMP14:%.*]] = add i64 [[TMP6]], 1
643
+ ; I32-NEXT: [[TMP15:%.*]] = add i64 [[TMP7]], 1
644
+ ; I32-NEXT: [[TMP16:%.*]] = add i64 [[TMP8]], 1
645
+ ; I32-NEXT: [[TMP17:%.*]] = add i64 [[TMP9]], 1
646
+ ; I32-NEXT: [[TMP18:%.*]] = add i64 [[TMP10]], 1
647
+ ; I32-NEXT: [[TMP19:%.*]] = insertelement <8 x i64> poison, i64 [[TMP11]], i32 0
648
+ ; I32-NEXT: [[TMP20:%.*]] = insertelement <8 x i64> [[TMP19]], i64 [[TMP12]], i32 1
649
+ ; I32-NEXT: [[TMP21:%.*]] = insertelement <8 x i64> [[TMP20]], i64 [[TMP13]], i32 2
650
+ ; I32-NEXT: [[TMP22:%.*]] = insertelement <8 x i64> [[TMP21]], i64 [[TMP14]], i32 3
651
+ ; I32-NEXT: [[TMP23:%.*]] = insertelement <8 x i64> [[TMP22]], i64 [[TMP15]], i32 4
652
+ ; I32-NEXT: [[TMP24:%.*]] = insertelement <8 x i64> [[TMP23]], i64 [[TMP16]], i32 5
653
+ ; I32-NEXT: [[TMP25:%.*]] = insertelement <8 x i64> [[TMP24]], i64 [[TMP17]], i32 6
654
+ ; I32-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 7
655
+ ; I32-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP11]]
656
+ ; I32-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP12]]
657
+ ; I32-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
658
+ ; I32-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]]
659
+ ; I32-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP15]]
660
+ ; I32-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP16]]
661
+ ; I32-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
662
+ ; I32-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP18]]
663
+ ; I32-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP27]], align 4
664
+ ; I32-NEXT: [[TMP36:%.*]] = load float, ptr [[TMP28]], align 4
665
+ ; I32-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP29]], align 4
666
+ ; I32-NEXT: [[TMP38:%.*]] = load float, ptr [[TMP30]], align 4
667
+ ; I32-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP31]], align 4
668
+ ; I32-NEXT: [[TMP40:%.*]] = load float, ptr [[TMP32]], align 4
669
+ ; I32-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP33]], align 4
670
+ ; I32-NEXT: [[TMP42:%.*]] = load float, ptr [[TMP34]], align 4
671
+ ; I32-NEXT: [[TMP43:%.*]] = insertelement <8 x float> poison, float [[TMP35]], i32 0
672
+ ; I32-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP36]], i32 1
673
+ ; I32-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP37]], i32 2
674
+ ; I32-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP38]], i32 3
675
+ ; I32-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP39]], i32 4
676
+ ; I32-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP40]], i32 5
677
+ ; I32-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP41]], i32 6
678
+ ; I32-NEXT: [[TMP50:%.*]] = insertelement <8 x float> [[TMP49]], float [[TMP42]], i32 7
679
+ ; I32-NEXT: [[TMP51:%.*]] = fcmp oeq <8 x float> [[TMP50]], zeroinitializer
680
+ ; I32-NEXT: [[TMP52:%.*]] = mul <8 x i64> [[TMP26]], [[BROADCAST_SPLAT]]
681
+ ; I32-NEXT: [[TMP53:%.*]] = extractelement <8 x i64> [[TMP52]], i32 0
682
+ ; I32-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP53]]
683
+ ; I32-NEXT: [[TMP55:%.*]] = extractelement <8 x i64> [[TMP52]], i32 1
684
+ ; I32-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP55]]
685
+ ; I32-NEXT: [[TMP57:%.*]] = extractelement <8 x i64> [[TMP52]], i32 2
686
+ ; I32-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP57]]
687
+ ; I32-NEXT: [[TMP59:%.*]] = extractelement <8 x i64> [[TMP52]], i32 3
688
+ ; I32-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP59]]
689
+ ; I32-NEXT: [[TMP61:%.*]] = extractelement <8 x i64> [[TMP52]], i32 4
690
+ ; I32-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP61]]
691
+ ; I32-NEXT: [[TMP63:%.*]] = extractelement <8 x i64> [[TMP52]], i32 5
692
+ ; I32-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP63]]
693
+ ; I32-NEXT: [[TMP65:%.*]] = extractelement <8 x i64> [[TMP52]], i32 6
694
+ ; I32-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP65]]
695
+ ; I32-NEXT: [[TMP67:%.*]] = extractelement <8 x i64> [[TMP52]], i32 7
696
+ ; I32-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP67]]
697
+ ; I32-NEXT: [[TMP69:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP54]], i32 0
698
+ ; I32-NEXT: [[TMP70:%.*]] = insertelement <8 x ptr> [[TMP69]], ptr [[TMP56]], i32 1
699
+ ; I32-NEXT: [[TMP71:%.*]] = insertelement <8 x ptr> [[TMP70]], ptr [[TMP58]], i32 2
700
+ ; I32-NEXT: [[TMP72:%.*]] = insertelement <8 x ptr> [[TMP71]], ptr [[TMP60]], i32 3
701
+ ; I32-NEXT: [[TMP73:%.*]] = insertelement <8 x ptr> [[TMP72]], ptr [[TMP62]], i32 4
702
+ ; I32-NEXT: [[TMP74:%.*]] = insertelement <8 x ptr> [[TMP73]], ptr [[TMP64]], i32 5
703
+ ; I32-NEXT: [[TMP75:%.*]] = insertelement <8 x ptr> [[TMP74]], ptr [[TMP66]], i32 6
704
+ ; I32-NEXT: [[TMP76:%.*]] = insertelement <8 x ptr> [[TMP75]], ptr [[TMP68]], i32 7
705
+ ; I32-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[TMP51]], <8 x ptr> [[TMP76]], <8 x ptr> [[BROADCAST_SPLAT2]]
706
+ ; I32-NEXT: [[TMP77:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 0
707
+ ; I32-NEXT: [[TMP78:%.*]] = load float, ptr [[TMP77]], align 4
708
+ ; I32-NEXT: [[TMP79:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 1
709
+ ; I32-NEXT: [[TMP80:%.*]] = load float, ptr [[TMP79]], align 4
710
+ ; I32-NEXT: [[TMP81:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 2
711
+ ; I32-NEXT: [[TMP82:%.*]] = load float, ptr [[TMP81]], align 4
712
+ ; I32-NEXT: [[TMP83:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 3
713
+ ; I32-NEXT: [[TMP84:%.*]] = load float, ptr [[TMP83]], align 4
714
+ ; I32-NEXT: [[TMP85:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 4
715
+ ; I32-NEXT: [[TMP86:%.*]] = load float, ptr [[TMP85]], align 4
716
+ ; I32-NEXT: [[TMP87:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 5
717
+ ; I32-NEXT: [[TMP88:%.*]] = load float, ptr [[TMP87]], align 4
718
+ ; I32-NEXT: [[TMP89:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 6
719
+ ; I32-NEXT: [[TMP90:%.*]] = load float, ptr [[TMP89]], align 4
720
+ ; I32-NEXT: [[TMP91:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 7
721
+ ; I32-NEXT: [[TMP92:%.*]] = load float, ptr [[TMP91]], align 4
722
+ ; I32-NEXT: [[TMP93:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
723
+ ; I32-NEXT: [[TMP94:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
724
+ ; I32-NEXT: [[TMP95:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]]
725
+ ; I32-NEXT: [[TMP96:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
726
+ ; I32-NEXT: [[TMP97:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
727
+ ; I32-NEXT: [[TMP98:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
728
+ ; I32-NEXT: [[TMP99:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP9]]
729
+ ; I32-NEXT: [[TMP100:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP10]]
730
+ ; I32-NEXT: store float [[TMP78]], ptr [[TMP93]], align 4
731
+ ; I32-NEXT: store float [[TMP80]], ptr [[TMP94]], align 4
732
+ ; I32-NEXT: store float [[TMP82]], ptr [[TMP95]], align 4
733
+ ; I32-NEXT: store float [[TMP84]], ptr [[TMP96]], align 4
734
+ ; I32-NEXT: store float [[TMP86]], ptr [[TMP97]], align 4
735
+ ; I32-NEXT: store float [[TMP88]], ptr [[TMP98]], align 4
736
+ ; I32-NEXT: store float [[TMP90]], ptr [[TMP99]], align 4
737
+ ; I32-NEXT: store float [[TMP92]], ptr [[TMP100]], align 4
738
+ ; I32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
739
+ ; I32-NEXT: [[TMP101:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
740
+ ; I32-NEXT: br i1 [[TMP101]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
741
+ ; I32: [[MIDDLE_BLOCK]]:
742
+ ; I32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
743
+ ; I32-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
744
+ ; I32: [[SCALAR_PH]]:
745
+ ;
746
+ entry:
747
+   br label %loop.header
748
+
749
+ loop.header:
750
+   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
751
+   %iv.2 = phi i64 [ %start, %entry ], [ %iv.2.next, %loop.latch ]
752
+   %iv.1 = add i64 %iv, 1
753
+   %gep.src = getelementptr i8, ptr %src, i64 %iv.1 ; byte offset %iv + 1 into %src
754
+   %l.src = load float, ptr %gep.src, align 4
755
+   %c = fcmp oeq float %l.src, 0.000000e+00
756
+   br i1 %c, label %then, label %loop.latch
757
+
758
+ then:
759
+   %iv.mul = mul i64 %iv.1, %start
760
+   %gep.src.2 = getelementptr i8, ptr %src.2, i64 %iv.mul
761
+   br label %loop.latch
762
+
763
+ loop.latch:
764
+   %merge.gep = phi ptr [ %gep.src.2, %then ], [ %src.2, %loop.header ] ; blended load address
765
+   %l.2 = load float, ptr %merge.gep, align 4
766
+   %gep.dst = getelementptr i8, ptr %dst, i64 %iv
767
+   store float %l.2, ptr %gep.dst, align 4
768
+   %iv.next = add i64 %iv, 1
769
+   %iv.2.next = add i64 %iv.2, -1
770
+   %ec = icmp sgt i64 %iv.2, 100 ; compares the phi value %iv.2, not %iv.2.next
771
+   br i1 %ec, label %loop.header, label %exit
772
+
773
+ exit:
774
+   ret void
775
+ }
776
+
777
+ attributes #0 = { "target-cpu" ="znver3" } ; NOTE(review): group #0 is defined again two lines below with "target-cpu"="znver2" - confirm which CPU the function referencing #0 should get, or give this group its own ID
583
778
attributes #0 = { "target-cpu" ="znver2" }
584
779
585
780
!0 = distinct !{!0 , !1 }
0 commit comments