Skip to content

Commit ba69e33

Browse files
committed
[LV] Consistently apply address def scalarization across loop.
Consistently scalarize loads used as part of address computations across all uses in the loop. This aligns the VPlan-based and legacy cost models and fixes a crash caused by the two diverging. It doesn't matter whether the load and its address users are in different blocks; as long as they are in the same loop, the scalar value can be used. This removes a number of insertelement/extractelement instructions.
1 parent 7886ae3 commit ba69e33

File tree

3 files changed

+229
-26
lines changed

3 files changed

+229
-26
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5696,7 +5696,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
56965696
Instruction *I = Worklist.pop_back_val();
56975697
for (auto &Op : I->operands())
56985698
if (auto *InstOp = dyn_cast<Instruction>(Op))
5699-
if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5699+
if (TheLoop->contains(InstOp) && !isa<PHINode>(InstOp) &&
57005700
AddrDefs.insert(InstOp).second)
57015701
Worklist.push_back(InstOp);
57025702
}

llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll

Lines changed: 224 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -621,8 +621,6 @@ define void @loaded_address_used_by_load_through_blend(i64 %start, ptr noalias %
621621
; I32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 8
622622
; I32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
623623
; I32-NEXT: [[TMP2:%.*]] = sub i64 [[START]], [[N_VEC]]
624-
; I32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[START]], i64 0
625-
; I32-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
626624
; I32-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x ptr> poison, ptr [[SRC_2]], i64 0
627625
; I32-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT1]], <8 x ptr> poison, <8 x i32> zeroinitializer
628626
; I32-NEXT: br label %[[VECTOR_BODY:.*]]
@@ -644,14 +642,6 @@ define void @loaded_address_used_by_load_through_blend(i64 %start, ptr noalias %
644642
; I32-NEXT: [[TMP16:%.*]] = add i64 [[TMP8]], 1
645643
; I32-NEXT: [[TMP17:%.*]] = add i64 [[TMP9]], 1
646644
; I32-NEXT: [[TMP18:%.*]] = add i64 [[TMP10]], 1
647-
; I32-NEXT: [[TMP19:%.*]] = insertelement <8 x i64> poison, i64 [[TMP11]], i32 0
648-
; I32-NEXT: [[TMP20:%.*]] = insertelement <8 x i64> [[TMP19]], i64 [[TMP12]], i32 1
649-
; I32-NEXT: [[TMP21:%.*]] = insertelement <8 x i64> [[TMP20]], i64 [[TMP13]], i32 2
650-
; I32-NEXT: [[TMP22:%.*]] = insertelement <8 x i64> [[TMP21]], i64 [[TMP14]], i32 3
651-
; I32-NEXT: [[TMP23:%.*]] = insertelement <8 x i64> [[TMP22]], i64 [[TMP15]], i32 4
652-
; I32-NEXT: [[TMP24:%.*]] = insertelement <8 x i64> [[TMP23]], i64 [[TMP16]], i32 5
653-
; I32-NEXT: [[TMP25:%.*]] = insertelement <8 x i64> [[TMP24]], i64 [[TMP17]], i32 6
654-
; I32-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 7
655645
; I32-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP11]]
656646
; I32-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP12]]
657647
; I32-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
@@ -677,22 +667,21 @@ define void @loaded_address_used_by_load_through_blend(i64 %start, ptr noalias %
677667
; I32-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP41]], i32 6
678668
; I32-NEXT: [[TMP50:%.*]] = insertelement <8 x float> [[TMP49]], float [[TMP42]], i32 7
679669
; I32-NEXT: [[TMP51:%.*]] = fcmp oeq <8 x float> [[TMP50]], zeroinitializer
680-
; I32-NEXT: [[TMP52:%.*]] = mul <8 x i64> [[TMP26]], [[BROADCAST_SPLAT]]
681-
; I32-NEXT: [[TMP53:%.*]] = extractelement <8 x i64> [[TMP52]], i32 0
670+
; I32-NEXT: [[TMP53:%.*]] = mul i64 [[TMP11]], [[START]]
671+
; I32-NEXT: [[TMP55:%.*]] = mul i64 [[TMP12]], [[START]]
672+
; I32-NEXT: [[TMP57:%.*]] = mul i64 [[TMP13]], [[START]]
673+
; I32-NEXT: [[TMP59:%.*]] = mul i64 [[TMP14]], [[START]]
674+
; I32-NEXT: [[TMP61:%.*]] = mul i64 [[TMP15]], [[START]]
675+
; I32-NEXT: [[TMP63:%.*]] = mul i64 [[TMP16]], [[START]]
676+
; I32-NEXT: [[TMP65:%.*]] = mul i64 [[TMP17]], [[START]]
677+
; I32-NEXT: [[TMP67:%.*]] = mul i64 [[TMP18]], [[START]]
682678
; I32-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP53]]
683-
; I32-NEXT: [[TMP55:%.*]] = extractelement <8 x i64> [[TMP52]], i32 1
684679
; I32-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP55]]
685-
; I32-NEXT: [[TMP57:%.*]] = extractelement <8 x i64> [[TMP52]], i32 2
686680
; I32-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP57]]
687-
; I32-NEXT: [[TMP59:%.*]] = extractelement <8 x i64> [[TMP52]], i32 3
688681
; I32-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP59]]
689-
; I32-NEXT: [[TMP61:%.*]] = extractelement <8 x i64> [[TMP52]], i32 4
690682
; I32-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP61]]
691-
; I32-NEXT: [[TMP63:%.*]] = extractelement <8 x i64> [[TMP52]], i32 5
692683
; I32-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP63]]
693-
; I32-NEXT: [[TMP65:%.*]] = extractelement <8 x i64> [[TMP52]], i32 6
694684
; I32-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP65]]
695-
; I32-NEXT: [[TMP67:%.*]] = extractelement <8 x i64> [[TMP52]], i32 7
696685
; I32-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP67]]
697686
; I32-NEXT: [[TMP69:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP54]], i32 0
698687
; I32-NEXT: [[TMP70:%.*]] = insertelement <8 x ptr> [[TMP69]], ptr [[TMP56]], i32 1
@@ -774,7 +763,222 @@ exit:
774763
ret void
775764
}
776765

777-
attributes #0 = { "target-cpu"="znver3" }
766+
define void @address_use_in_different_block(ptr noalias %dst, ptr %src.0, ptr %src.1, i32 %x) #0 {
767+
; I64-LABEL: define void @address_use_in_different_block(
768+
; I64-SAME: ptr noalias [[DST:%.*]], ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
769+
; I64-NEXT: [[ENTRY:.*:]]
770+
; I64-NEXT: [[X_POS:%.*]] = call i32 @llvm.smax.i32(i32 [[X]], i32 0)
771+
; I64-NEXT: [[OFFSET:%.*]] = zext i32 [[X_POS]] to i64
772+
; I64-NEXT: br label %[[VECTOR_PH:.*]]
773+
; I64: [[VECTOR_PH]]:
774+
; I64-NEXT: br label %[[VECTOR_BODY:.*]]
775+
; I64: [[VECTOR_BODY]]:
776+
; I64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
777+
; I64-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
778+
; I64-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
779+
; I64-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
780+
; I64-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
781+
; I64-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
782+
; I64-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
783+
; I64-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
784+
; I64-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
785+
; I64-NEXT: [[TMP8:%.*]] = mul i64 [[TMP0]], [[OFFSET]]
786+
; I64-NEXT: [[TMP9:%.*]] = mul i64 [[TMP1]], [[OFFSET]]
787+
; I64-NEXT: [[TMP10:%.*]] = mul i64 [[TMP2]], [[OFFSET]]
788+
; I64-NEXT: [[TMP11:%.*]] = mul i64 [[TMP3]], [[OFFSET]]
789+
; I64-NEXT: [[TMP12:%.*]] = mul i64 [[TMP4]], [[OFFSET]]
790+
; I64-NEXT: [[TMP13:%.*]] = mul i64 [[TMP5]], [[OFFSET]]
791+
; I64-NEXT: [[TMP14:%.*]] = mul i64 [[TMP6]], [[OFFSET]]
792+
; I64-NEXT: [[TMP15:%.*]] = mul i64 [[TMP7]], [[OFFSET]]
793+
; I64-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP8]]
794+
; I64-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP9]]
795+
; I64-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP10]]
796+
; I64-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP11]]
797+
; I64-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP12]]
798+
; I64-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP13]]
799+
; I64-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP14]]
800+
; I64-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP15]]
801+
; I64-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP16]], align 4
802+
; I64-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP17]], align 4
803+
; I64-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP18]], align 4
804+
; I64-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP19]], align 4
805+
; I64-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP20]], align 4
806+
; I64-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP21]], align 4
807+
; I64-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP22]], align 4
808+
; I64-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP23]], align 4
809+
; I64-NEXT: [[TMP32:%.*]] = sext i32 [[TMP24]] to i64
810+
; I64-NEXT: [[TMP33:%.*]] = sext i32 [[TMP25]] to i64
811+
; I64-NEXT: [[TMP34:%.*]] = sext i32 [[TMP26]] to i64
812+
; I64-NEXT: [[TMP35:%.*]] = sext i32 [[TMP27]] to i64
813+
; I64-NEXT: [[TMP36:%.*]] = sext i32 [[TMP28]] to i64
814+
; I64-NEXT: [[TMP37:%.*]] = sext i32 [[TMP29]] to i64
815+
; I64-NEXT: [[TMP38:%.*]] = sext i32 [[TMP30]] to i64
816+
; I64-NEXT: [[TMP39:%.*]] = sext i32 [[TMP31]] to i64
817+
; I64-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP32]]
818+
; I64-NEXT: [[TMP41:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP33]]
819+
; I64-NEXT: [[TMP42:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP34]]
820+
; I64-NEXT: [[TMP43:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP35]]
821+
; I64-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP36]]
822+
; I64-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP37]]
823+
; I64-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP38]]
824+
; I64-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP39]]
825+
; I64-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP40]], i64 -8
826+
; I64-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP41]], i64 -8
827+
; I64-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[TMP42]], i64 -8
828+
; I64-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[TMP43]], i64 -8
829+
; I64-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[TMP44]], i64 -8
830+
; I64-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[TMP45]], i64 -8
831+
; I64-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[TMP46]], i64 -8
832+
; I64-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[TMP47]], i64 -8
833+
; I64-NEXT: [[TMP56:%.*]] = load double, ptr [[TMP48]], align 8
834+
; I64-NEXT: [[TMP57:%.*]] = load double, ptr [[TMP49]], align 8
835+
; I64-NEXT: [[TMP58:%.*]] = insertelement <2 x double> poison, double [[TMP56]], i32 0
836+
; I64-NEXT: [[TMP59:%.*]] = insertelement <2 x double> [[TMP58]], double [[TMP57]], i32 1
837+
; I64-NEXT: [[TMP60:%.*]] = load double, ptr [[TMP50]], align 8
838+
; I64-NEXT: [[TMP61:%.*]] = load double, ptr [[TMP51]], align 8
839+
; I64-NEXT: [[TMP62:%.*]] = insertelement <2 x double> poison, double [[TMP60]], i32 0
840+
; I64-NEXT: [[TMP63:%.*]] = insertelement <2 x double> [[TMP62]], double [[TMP61]], i32 1
841+
; I64-NEXT: [[TMP64:%.*]] = load double, ptr [[TMP52]], align 8
842+
; I64-NEXT: [[TMP65:%.*]] = load double, ptr [[TMP53]], align 8
843+
; I64-NEXT: [[TMP66:%.*]] = insertelement <2 x double> poison, double [[TMP64]], i32 0
844+
; I64-NEXT: [[TMP67:%.*]] = insertelement <2 x double> [[TMP66]], double [[TMP65]], i32 1
845+
; I64-NEXT: [[TMP68:%.*]] = load double, ptr [[TMP54]], align 8
846+
; I64-NEXT: [[TMP69:%.*]] = load double, ptr [[TMP55]], align 8
847+
; I64-NEXT: [[TMP70:%.*]] = insertelement <2 x double> poison, double [[TMP68]], i32 0
848+
; I64-NEXT: [[TMP71:%.*]] = insertelement <2 x double> [[TMP70]], double [[TMP69]], i32 1
849+
; I64-NEXT: [[TMP72:%.*]] = fsub <2 x double> zeroinitializer, [[TMP59]]
850+
; I64-NEXT: [[TMP73:%.*]] = fsub <2 x double> zeroinitializer, [[TMP63]]
851+
; I64-NEXT: [[TMP74:%.*]] = fsub <2 x double> zeroinitializer, [[TMP67]]
852+
; I64-NEXT: [[TMP75:%.*]] = fsub <2 x double> zeroinitializer, [[TMP71]]
853+
; I64-NEXT: [[TMP76:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP8]]
854+
; I64-NEXT: [[TMP77:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP9]]
855+
; I64-NEXT: [[TMP78:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]]
856+
; I64-NEXT: [[TMP79:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP11]]
857+
; I64-NEXT: [[TMP80:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]]
858+
; I64-NEXT: [[TMP81:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP13]]
859+
; I64-NEXT: [[TMP82:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP14]]
860+
; I64-NEXT: [[TMP83:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP15]]
861+
; I64-NEXT: [[TMP84:%.*]] = extractelement <2 x double> [[TMP72]], i32 0
862+
; I64-NEXT: store double [[TMP84]], ptr [[TMP76]], align 8
863+
; I64-NEXT: [[TMP85:%.*]] = extractelement <2 x double> [[TMP72]], i32 1
864+
; I64-NEXT: store double [[TMP85]], ptr [[TMP77]], align 8
865+
; I64-NEXT: [[TMP86:%.*]] = extractelement <2 x double> [[TMP73]], i32 0
866+
; I64-NEXT: store double [[TMP86]], ptr [[TMP78]], align 8
867+
; I64-NEXT: [[TMP87:%.*]] = extractelement <2 x double> [[TMP73]], i32 1
868+
; I64-NEXT: store double [[TMP87]], ptr [[TMP79]], align 8
869+
; I64-NEXT: [[TMP88:%.*]] = extractelement <2 x double> [[TMP74]], i32 0
870+
; I64-NEXT: store double [[TMP88]], ptr [[TMP80]], align 8
871+
; I64-NEXT: [[TMP89:%.*]] = extractelement <2 x double> [[TMP74]], i32 1
872+
; I64-NEXT: store double [[TMP89]], ptr [[TMP81]], align 8
873+
; I64-NEXT: [[TMP90:%.*]] = extractelement <2 x double> [[TMP75]], i32 0
874+
; I64-NEXT: store double [[TMP90]], ptr [[TMP82]], align 8
875+
; I64-NEXT: [[TMP91:%.*]] = extractelement <2 x double> [[TMP75]], i32 1
876+
; I64-NEXT: store double [[TMP91]], ptr [[TMP83]], align 8
877+
; I64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
878+
; I64-NEXT: [[TMP92:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
879+
; I64-NEXT: br i1 [[TMP92]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
880+
; I64: [[MIDDLE_BLOCK]]:
881+
; I64-NEXT: br label %[[SCALAR_PH:.*]]
882+
; I64: [[SCALAR_PH]]:
883+
;
884+
; I32-LABEL: define void @address_use_in_different_block(
885+
; I32-SAME: ptr noalias [[DST:%.*]], ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
886+
; I32-NEXT: [[ENTRY:.*:]]
887+
; I32-NEXT: [[X_POS:%.*]] = call i32 @llvm.smax.i32(i32 [[X]], i32 0)
888+
; I32-NEXT: [[OFFSET:%.*]] = zext i32 [[X_POS]] to i64
889+
; I32-NEXT: br label %[[VECTOR_PH:.*]]
890+
; I32: [[VECTOR_PH]]:
891+
; I32-NEXT: br label %[[VECTOR_BODY:.*]]
892+
; I32: [[VECTOR_BODY]]:
893+
; I32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
894+
; I32-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
895+
; I32-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
896+
; I32-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
897+
; I32-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
898+
; I32-NEXT: [[TMP4:%.*]] = mul i64 [[TMP0]], [[OFFSET]]
899+
; I32-NEXT: [[TMP5:%.*]] = mul i64 [[TMP1]], [[OFFSET]]
900+
; I32-NEXT: [[TMP6:%.*]] = mul i64 [[TMP2]], [[OFFSET]]
901+
; I32-NEXT: [[TMP7:%.*]] = mul i64 [[TMP3]], [[OFFSET]]
902+
; I32-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP4]]
903+
; I32-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP5]]
904+
; I32-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP6]]
905+
; I32-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP7]]
906+
; I32-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP8]], align 4
907+
; I32-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP9]], align 4
908+
; I32-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 4
909+
; I32-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4
910+
; I32-NEXT: [[TMP16:%.*]] = sext i32 [[TMP12]] to i64
911+
; I32-NEXT: [[TMP17:%.*]] = sext i32 [[TMP13]] to i64
912+
; I32-NEXT: [[TMP18:%.*]] = sext i32 [[TMP14]] to i64
913+
; I32-NEXT: [[TMP19:%.*]] = sext i32 [[TMP15]] to i64
914+
; I32-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP16]]
915+
; I32-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP17]]
916+
; I32-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP18]]
917+
; I32-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP19]]
918+
; I32-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP20]], i64 -8
919+
; I32-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 -8
920+
; I32-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP22]], i64 -8
921+
; I32-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP23]], i64 -8
922+
; I32-NEXT: [[TMP28:%.*]] = load double, ptr [[TMP24]], align 8
923+
; I32-NEXT: [[TMP29:%.*]] = load double, ptr [[TMP25]], align 8
924+
; I32-NEXT: [[TMP30:%.*]] = load double, ptr [[TMP26]], align 8
925+
; I32-NEXT: [[TMP31:%.*]] = load double, ptr [[TMP27]], align 8
926+
; I32-NEXT: [[TMP32:%.*]] = insertelement <4 x double> poison, double [[TMP28]], i32 0
927+
; I32-NEXT: [[TMP33:%.*]] = insertelement <4 x double> [[TMP32]], double [[TMP29]], i32 1
928+
; I32-NEXT: [[TMP34:%.*]] = insertelement <4 x double> [[TMP33]], double [[TMP30]], i32 2
929+
; I32-NEXT: [[TMP35:%.*]] = insertelement <4 x double> [[TMP34]], double [[TMP31]], i32 3
930+
; I32-NEXT: [[TMP36:%.*]] = fsub <4 x double> zeroinitializer, [[TMP35]]
931+
; I32-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP4]]
932+
; I32-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP5]]
933+
; I32-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP6]]
934+
; I32-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP7]]
935+
; I32-NEXT: [[TMP41:%.*]] = extractelement <4 x double> [[TMP36]], i32 0
936+
; I32-NEXT: store double [[TMP41]], ptr [[TMP37]], align 8
937+
; I32-NEXT: [[TMP42:%.*]] = extractelement <4 x double> [[TMP36]], i32 1
938+
; I32-NEXT: store double [[TMP42]], ptr [[TMP38]], align 8
939+
; I32-NEXT: [[TMP43:%.*]] = extractelement <4 x double> [[TMP36]], i32 2
940+
; I32-NEXT: store double [[TMP43]], ptr [[TMP39]], align 8
941+
; I32-NEXT: [[TMP44:%.*]] = extractelement <4 x double> [[TMP36]], i32 3
942+
; I32-NEXT: store double [[TMP44]], ptr [[TMP40]], align 8
943+
; I32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
944+
; I32-NEXT: [[TMP45:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
945+
; I32-NEXT: br i1 [[TMP45]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
946+
; I32: [[MIDDLE_BLOCK]]:
947+
; I32-NEXT: br label %[[SCALAR_PH:.*]]
948+
; I32: [[SCALAR_PH]]:
949+
;
950+
entry:
951+
%x.pos = call i32 @llvm.smax.i32(i32 %x, i32 0)
952+
%offset = zext i32 %x.pos to i64
953+
br label %loop.header
954+
955+
loop.header:
956+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
957+
%7 = mul i64 %iv, %offset
958+
%gep.src.0 = getelementptr i32, ptr %src.0, i64 %7
959+
%l8 = load i32, ptr %gep.src.0, align 4
960+
%c = icmp sgt i32 %x, 0
961+
br i1 %c, label %loop.latch, label %then
962+
963+
then:
964+
br label %loop.latch
965+
966+
loop.latch:
967+
%l.ext = sext i32 %l8 to i64
968+
%gep.src.1 = getelementptr double, ptr %src.1, i64 %l.ext
969+
%13 = getelementptr i8, ptr %gep.src.1, i64 -8
970+
%l.2 = load double, ptr %13, align 8
971+
%sub = fsub double 0.000000e+00, %l.2
972+
%gep.dst = getelementptr double, ptr %dst, i64 %7
973+
store double %sub, ptr %gep.dst, align 8
974+
%iv.next = add i64 %iv, 1
975+
%ec = icmp eq i64 %iv, 100
976+
br i1 %ec, label %exit, label %loop.header
977+
978+
exit:
979+
ret void
980+
}
981+
778982
attributes #0 = { "target-cpu"="znver2" }
779983

780984
!0 = distinct !{!0, !1}

0 commit comments

Comments
 (0)