Skip to content

Commit 98ce434

Browse files
committed
[VPlan] Skip VPBlendRecipe in isUsedByLoadStoreAddress.
VPBlendRecipes are introduced as part of if-conversion, potentially adding a def-use chain from a load used in a compare to another load/store. In the scalar IR, there is no such connection via def-use chains, so the legacy cost model does not consider the load to be used by a memory operation. Skipping blends brings the VPlan-based cost computation in line with the legacy cost model after llvm/llvm-project#162157.
1 parent eb1ce38 commit 98ce434

File tree

2 files changed

+196
-1
lines changed

2 files changed

+196
-1
lines changed

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3141,7 +3141,7 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
31413141

31423142
while (!WorkList.empty()) {
31433143
auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
3144-
if (!Cur || !Seen.insert(Cur).second)
3144+
if (!Cur || !Seen.insert(Cur).second || isa<VPBlendRecipe>(Cur))
31453145
continue;
31463146

31473147
for (VPUser *U : Cur->users()) {

llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,201 @@ exit:
580580
ret double %accum
581581
}
582582

583+
define void @loaded_address_used_by_load_through_blend(i64 %start, ptr noalias %src, ptr noalias %src.2, ptr noalias %dst) #0 {
584+
; I64-LABEL: define void @loaded_address_used_by_load_through_blend(
585+
; I64-SAME: i64 [[START:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0]] {
586+
; I64-NEXT: [[ENTRY:.*]]:
587+
; I64-NEXT: br label %[[LOOP_HEADER:.*]]
588+
; I64: [[LOOP_HEADER]]:
589+
; I64-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
590+
; I64-NEXT: [[IV_2:%.*]] = phi i64 [ [[START]], %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP_LATCH]] ]
591+
; I64-NEXT: [[IV_1:%.*]] = add i64 [[IV]], 1
592+
; I64-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_1]]
593+
; I64-NEXT: [[L_SRC:%.*]] = load float, ptr [[GEP_SRC]], align 4
594+
; I64-NEXT: [[C:%.*]] = fcmp oeq float [[L_SRC]], 0.000000e+00
595+
; I64-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
596+
; I64: [[THEN]]:
597+
; I64-NEXT: [[IV_MUL:%.*]] = mul i64 [[IV_1]], [[START]]
598+
; I64-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[IV_MUL]]
599+
; I64-NEXT: br label %[[LOOP_LATCH]]
600+
; I64: [[LOOP_LATCH]]:
601+
; I64-NEXT: [[MERGE_GEP:%.*]] = phi ptr [ [[GEP_SRC_2]], %[[THEN]] ], [ [[SRC_2]], %[[LOOP_HEADER]] ]
602+
; I64-NEXT: [[L_2:%.*]] = load float, ptr [[MERGE_GEP]], align 4
603+
; I64-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
604+
; I64-NEXT: store float [[L_2]], ptr [[GEP_DST]], align 4
605+
; I64-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
606+
; I64-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], -1
607+
; I64-NEXT: [[EC:%.*]] = icmp sgt i64 [[IV_2]], 100
608+
; I64-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT:.*]]
609+
; I64: [[EXIT]]:
610+
; I64-NEXT: ret void
611+
;
612+
; I32-LABEL: define void @loaded_address_used_by_load_through_blend(
613+
; I32-SAME: i64 [[START:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0]] {
614+
; I32-NEXT: [[ENTRY:.*:]]
615+
; I32-NEXT: [[TMP0:%.*]] = add i64 [[START]], 1
616+
; I32-NEXT: [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 [[START]], i64 100)
617+
; I32-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SMIN]]
618+
; I32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 8
619+
; I32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
620+
; I32: [[VECTOR_PH]]:
621+
; I32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 8
622+
; I32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
623+
; I32-NEXT: [[TMP2:%.*]] = sub i64 [[START]], [[N_VEC]]
624+
; I32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[START]], i64 0
625+
; I32-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
626+
; I32-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x ptr> poison, ptr [[SRC_2]], i64 0
627+
; I32-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT1]], <8 x ptr> poison, <8 x i32> zeroinitializer
628+
; I32-NEXT: br label %[[VECTOR_BODY:.*]]
629+
; I32: [[VECTOR_BODY]]:
630+
; I32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
631+
; I32-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
632+
; I32-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1
633+
; I32-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 2
634+
; I32-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 3
635+
; I32-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 4
636+
; I32-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 5
637+
; I32-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 6
638+
; I32-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 7
639+
; I32-NEXT: [[TMP11:%.*]] = add i64 [[TMP3]], 1
640+
; I32-NEXT: [[TMP12:%.*]] = add i64 [[TMP4]], 1
641+
; I32-NEXT: [[TMP13:%.*]] = add i64 [[TMP5]], 1
642+
; I32-NEXT: [[TMP14:%.*]] = add i64 [[TMP6]], 1
643+
; I32-NEXT: [[TMP15:%.*]] = add i64 [[TMP7]], 1
644+
; I32-NEXT: [[TMP16:%.*]] = add i64 [[TMP8]], 1
645+
; I32-NEXT: [[TMP17:%.*]] = add i64 [[TMP9]], 1
646+
; I32-NEXT: [[TMP18:%.*]] = add i64 [[TMP10]], 1
647+
; I32-NEXT: [[TMP19:%.*]] = insertelement <8 x i64> poison, i64 [[TMP11]], i32 0
648+
; I32-NEXT: [[TMP20:%.*]] = insertelement <8 x i64> [[TMP19]], i64 [[TMP12]], i32 1
649+
; I32-NEXT: [[TMP21:%.*]] = insertelement <8 x i64> [[TMP20]], i64 [[TMP13]], i32 2
650+
; I32-NEXT: [[TMP22:%.*]] = insertelement <8 x i64> [[TMP21]], i64 [[TMP14]], i32 3
651+
; I32-NEXT: [[TMP23:%.*]] = insertelement <8 x i64> [[TMP22]], i64 [[TMP15]], i32 4
652+
; I32-NEXT: [[TMP24:%.*]] = insertelement <8 x i64> [[TMP23]], i64 [[TMP16]], i32 5
653+
; I32-NEXT: [[TMP25:%.*]] = insertelement <8 x i64> [[TMP24]], i64 [[TMP17]], i32 6
654+
; I32-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 7
655+
; I32-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP11]]
656+
; I32-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP12]]
657+
; I32-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
658+
; I32-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]]
659+
; I32-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP15]]
660+
; I32-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP16]]
661+
; I32-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
662+
; I32-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP18]]
663+
; I32-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP27]], align 4
664+
; I32-NEXT: [[TMP36:%.*]] = load float, ptr [[TMP28]], align 4
665+
; I32-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP29]], align 4
666+
; I32-NEXT: [[TMP38:%.*]] = load float, ptr [[TMP30]], align 4
667+
; I32-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP31]], align 4
668+
; I32-NEXT: [[TMP40:%.*]] = load float, ptr [[TMP32]], align 4
669+
; I32-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP33]], align 4
670+
; I32-NEXT: [[TMP42:%.*]] = load float, ptr [[TMP34]], align 4
671+
; I32-NEXT: [[TMP43:%.*]] = insertelement <8 x float> poison, float [[TMP35]], i32 0
672+
; I32-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP36]], i32 1
673+
; I32-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP37]], i32 2
674+
; I32-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP38]], i32 3
675+
; I32-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP39]], i32 4
676+
; I32-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP40]], i32 5
677+
; I32-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP41]], i32 6
678+
; I32-NEXT: [[TMP50:%.*]] = insertelement <8 x float> [[TMP49]], float [[TMP42]], i32 7
679+
; I32-NEXT: [[TMP51:%.*]] = fcmp oeq <8 x float> [[TMP50]], zeroinitializer
680+
; I32-NEXT: [[TMP52:%.*]] = mul <8 x i64> [[TMP26]], [[BROADCAST_SPLAT]]
681+
; I32-NEXT: [[TMP53:%.*]] = extractelement <8 x i64> [[TMP52]], i32 0
682+
; I32-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP53]]
683+
; I32-NEXT: [[TMP55:%.*]] = extractelement <8 x i64> [[TMP52]], i32 1
684+
; I32-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP55]]
685+
; I32-NEXT: [[TMP57:%.*]] = extractelement <8 x i64> [[TMP52]], i32 2
686+
; I32-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP57]]
687+
; I32-NEXT: [[TMP59:%.*]] = extractelement <8 x i64> [[TMP52]], i32 3
688+
; I32-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP59]]
689+
; I32-NEXT: [[TMP61:%.*]] = extractelement <8 x i64> [[TMP52]], i32 4
690+
; I32-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP61]]
691+
; I32-NEXT: [[TMP63:%.*]] = extractelement <8 x i64> [[TMP52]], i32 5
692+
; I32-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP63]]
693+
; I32-NEXT: [[TMP65:%.*]] = extractelement <8 x i64> [[TMP52]], i32 6
694+
; I32-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP65]]
695+
; I32-NEXT: [[TMP67:%.*]] = extractelement <8 x i64> [[TMP52]], i32 7
696+
; I32-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP67]]
697+
; I32-NEXT: [[TMP69:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP54]], i32 0
698+
; I32-NEXT: [[TMP70:%.*]] = insertelement <8 x ptr> [[TMP69]], ptr [[TMP56]], i32 1
699+
; I32-NEXT: [[TMP71:%.*]] = insertelement <8 x ptr> [[TMP70]], ptr [[TMP58]], i32 2
700+
; I32-NEXT: [[TMP72:%.*]] = insertelement <8 x ptr> [[TMP71]], ptr [[TMP60]], i32 3
701+
; I32-NEXT: [[TMP73:%.*]] = insertelement <8 x ptr> [[TMP72]], ptr [[TMP62]], i32 4
702+
; I32-NEXT: [[TMP74:%.*]] = insertelement <8 x ptr> [[TMP73]], ptr [[TMP64]], i32 5
703+
; I32-NEXT: [[TMP75:%.*]] = insertelement <8 x ptr> [[TMP74]], ptr [[TMP66]], i32 6
704+
; I32-NEXT: [[TMP76:%.*]] = insertelement <8 x ptr> [[TMP75]], ptr [[TMP68]], i32 7
705+
; I32-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[TMP51]], <8 x ptr> [[TMP76]], <8 x ptr> [[BROADCAST_SPLAT2]]
706+
; I32-NEXT: [[TMP77:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 0
707+
; I32-NEXT: [[TMP78:%.*]] = load float, ptr [[TMP77]], align 4
708+
; I32-NEXT: [[TMP79:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 1
709+
; I32-NEXT: [[TMP80:%.*]] = load float, ptr [[TMP79]], align 4
710+
; I32-NEXT: [[TMP81:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 2
711+
; I32-NEXT: [[TMP82:%.*]] = load float, ptr [[TMP81]], align 4
712+
; I32-NEXT: [[TMP83:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 3
713+
; I32-NEXT: [[TMP84:%.*]] = load float, ptr [[TMP83]], align 4
714+
; I32-NEXT: [[TMP85:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 4
715+
; I32-NEXT: [[TMP86:%.*]] = load float, ptr [[TMP85]], align 4
716+
; I32-NEXT: [[TMP87:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 5
717+
; I32-NEXT: [[TMP88:%.*]] = load float, ptr [[TMP87]], align 4
718+
; I32-NEXT: [[TMP89:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 6
719+
; I32-NEXT: [[TMP90:%.*]] = load float, ptr [[TMP89]], align 4
720+
; I32-NEXT: [[TMP91:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 7
721+
; I32-NEXT: [[TMP92:%.*]] = load float, ptr [[TMP91]], align 4
722+
; I32-NEXT: [[TMP93:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
723+
; I32-NEXT: [[TMP94:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
724+
; I32-NEXT: [[TMP95:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]]
725+
; I32-NEXT: [[TMP96:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
726+
; I32-NEXT: [[TMP97:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
727+
; I32-NEXT: [[TMP98:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
728+
; I32-NEXT: [[TMP99:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP9]]
729+
; I32-NEXT: [[TMP100:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP10]]
730+
; I32-NEXT: store float [[TMP78]], ptr [[TMP93]], align 4
731+
; I32-NEXT: store float [[TMP80]], ptr [[TMP94]], align 4
732+
; I32-NEXT: store float [[TMP82]], ptr [[TMP95]], align 4
733+
; I32-NEXT: store float [[TMP84]], ptr [[TMP96]], align 4
734+
; I32-NEXT: store float [[TMP86]], ptr [[TMP97]], align 4
735+
; I32-NEXT: store float [[TMP88]], ptr [[TMP98]], align 4
736+
; I32-NEXT: store float [[TMP90]], ptr [[TMP99]], align 4
737+
; I32-NEXT: store float [[TMP92]], ptr [[TMP100]], align 4
738+
; I32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
739+
; I32-NEXT: [[TMP101:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
740+
; I32-NEXT: br i1 [[TMP101]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
741+
; I32: [[MIDDLE_BLOCK]]:
742+
; I32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
743+
; I32-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
744+
; I32: [[SCALAR_PH]]:
745+
;
746+
entry:
747+
br label %loop.header
748+
749+
loop.header:
750+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
751+
%iv.2 = phi i64 [ %start, %entry ], [ %iv.2.next, %loop.latch ]
752+
%iv.1 = add i64 %iv, 1
753+
%gep.src = getelementptr i8, ptr %src, i64 %iv.1
754+
%l.src = load float, ptr %gep.src, align 4
755+
%c = fcmp oeq float %l.src, 0.000000e+00
756+
br i1 %c, label %then, label %loop.latch
757+
758+
then:
759+
%iv.mul = mul i64 %iv.1, %start
760+
%gep.src.2 = getelementptr i8, ptr %src.2, i64 %iv.mul
761+
br label %loop.latch
762+
763+
loop.latch:
764+
%merge.gep = phi ptr [ %gep.src.2, %then ], [ %src.2, %loop.header ]
765+
%l.2 = load float, ptr %merge.gep, align 4
766+
%gep.dst = getelementptr i8, ptr %dst, i64 %iv
767+
store float %l.2, ptr %gep.dst, align 4
768+
%iv.next = add i64 %iv, 1
769+
%iv.2.next = add i64 %iv.2, -1
770+
%ec = icmp sgt i64 %iv.2, 100
771+
br i1 %ec, label %loop.header, label %exit
772+
773+
exit:
774+
ret void
775+
}
776+
777+
attributes #0 = { "target-cpu"="znver3" }
583778
attributes #0 = { "target-cpu"="znver2" }
584779

585780
!0 = distinct !{!0, !1}

0 commit comments

Comments
 (0)