Skip to content

Commit f0e1254

Browse files
authored
[LV] Use forced cost once for whole interleave group in legacy cost model (#168270)
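The VPlan-based cost model assigns the forced cost once for a whole VPInterleaveRecipe. Update the legacy cost model to match this behavior: when an instruction's widening decision is CM_Interleave, the forced cost is charged only to the member at the group's insert position, and every other member of the group is assigned a cost of 0. This fixes a divergence between the legacy and VPlan-based cost models, and assigns the cost in a way that matches the generated code more accurately. PR: #168270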
1 parent 139ebfa commit f0e1254

File tree

2 files changed

+171
-3
lines changed

2 files changed

+171
-3
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5122,8 +5122,18 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
51225122
InstructionCost C = getInstructionCost(&I, VF);
51235123

51245124
// Check if we should override the cost.
5125-
if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5126-
C = InstructionCost(ForceTargetInstructionCost);
5125+
if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) {
5126+
// For interleave groups, use ForceTargetInstructionCost once for the
5127+
// whole group.
5128+
if (VF.isVector() && getWideningDecision(&I, VF) == CM_Interleave) {
5129+
if (getInterleavedAccessGroup(&I)->getInsertPos() == &I)
5130+
C = InstructionCost(ForceTargetInstructionCost);
5131+
else
5132+
C = InstructionCost(0);
5133+
} else {
5134+
C = InstructionCost(ForceTargetInstructionCost);
5135+
}
5136+
}
51275137

51285138
BlockCost += C;
51295139
LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "

llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll

Lines changed: 159 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,7 @@ for.end:
380380
ret void
381381
}
382382

383-
define void @loop_with_freeze_and_conditional_srem(ptr %dst, ptr %keyinfo, ptr %invariant.ptr, i32 %divisor) #1 {
383+
define void @loop_with_freeze_and_conditional_srem(ptr %dst, ptr %keyinfo, ptr %invariant.ptr, i32 %divisor) {
384384
; COMMON-LABEL: define void @loop_with_freeze_and_conditional_srem(
385385
; COMMON-SAME: ptr [[DST:%.*]], ptr [[KEYINFO:%.*]], ptr [[INVARIANT_PTR:%.*]], i32 [[DIVISOR:%.*]]) {
386386
; COMMON-NEXT: [[ENTRY:.*]]:
@@ -433,7 +433,165 @@ exit: ; preds = %loop.latch
433433
ret void
434434
}
435435

436+
define void @interleave_group(ptr %dst) #1 {
437+
; COST1-LABEL: define void @interleave_group(
438+
; COST1-SAME: ptr [[DST:%.*]]) #[[ATTR1:[0-9]+]] {
439+
; COST1-NEXT: [[ITER_CHECK:.*:]]
440+
; COST1-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
441+
; COST1: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
442+
; COST1-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
443+
; COST1: [[VECTOR_PH]]:
444+
; COST1-NEXT: br label %[[VECTOR_BODY:.*]]
445+
; COST1: [[VECTOR_BODY]]:
446+
; COST1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
447+
; COST1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 16
448+
; COST1-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], 3
449+
; COST1-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0]], 3
450+
; COST1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP1]]
451+
; COST1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
452+
; COST1-NEXT: store <48 x i8> zeroinitializer, ptr [[TMP3]], align 1
453+
; COST1-NEXT: store <48 x i8> zeroinitializer, ptr [[TMP4]], align 1
454+
; COST1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
455+
; COST1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
456+
; COST1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
457+
; COST1: [[MIDDLE_BLOCK]]:
458+
; COST1-NEXT: br i1 false, [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
459+
; COST1: [[VEC_EPILOG_ITER_CHECK]]:
460+
; COST1-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF4]]
461+
; COST1: [[VEC_EPILOG_PH]]:
462+
; COST1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
463+
; COST1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
464+
; COST1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
465+
; COST1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
466+
; COST1-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
467+
; COST1: [[VEC_EPILOG_VECTOR_BODY]]:
468+
; COST1-NEXT: [[INDEX1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
469+
; COST1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
470+
; COST1-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[VEC_IND]], splat (i64 3)
471+
; COST1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
472+
; COST1-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
473+
; COST1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
474+
; COST1-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
475+
; COST1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
476+
; COST1-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
477+
; COST1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP9]]
478+
; COST1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP10]]
479+
; COST1-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP11]], i64 2
480+
; COST1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP12]], i64 2
481+
; COST1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 2
482+
; COST1-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 2
483+
; COST1-NEXT: store i8 0, ptr [[TMP15]], align 1
484+
; COST1-NEXT: store i8 0, ptr [[TMP16]], align 1
485+
; COST1-NEXT: store i8 0, ptr [[TMP17]], align 1
486+
; COST1-NEXT: store i8 0, ptr [[TMP18]], align 1
487+
; COST1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP11]], i64 1
488+
; COST1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP12]], i64 1
489+
; COST1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP13]], i64 1
490+
; COST1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP14]], i64 1
491+
; COST1-NEXT: store i8 0, ptr [[TMP19]], align 1
492+
; COST1-NEXT: store i8 0, ptr [[TMP20]], align 1
493+
; COST1-NEXT: store i8 0, ptr [[TMP21]], align 1
494+
; COST1-NEXT: store i8 0, ptr [[TMP22]], align 1
495+
; COST1-NEXT: store i8 0, ptr [[TMP11]], align 1
496+
; COST1-NEXT: store i8 0, ptr [[TMP12]], align 1
497+
; COST1-NEXT: store i8 0, ptr [[TMP13]], align 1
498+
; COST1-NEXT: store i8 0, ptr [[TMP14]], align 1
499+
; COST1-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
500+
; COST1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
501+
; COST1-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 100
502+
; COST1-NEXT: br i1 [[TMP23]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
503+
; COST1: [[VEC_EPILOG_MIDDLE_BLOCK]]:
504+
; COST1-NEXT: br i1 false, [[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
505+
; COST1: [[VEC_EPILOG_SCALAR_PH]]:
506+
;
507+
; COST10-LABEL: define void @interleave_group(
508+
; COST10-SAME: ptr [[DST:%.*]]) #[[ATTR1:[0-9]+]] {
509+
; COST10-NEXT: [[ITER_CHECK:.*:]]
510+
; COST10-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
511+
; COST10: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
512+
; COST10-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
513+
; COST10: [[VECTOR_PH]]:
514+
; COST10-NEXT: br label %[[VECTOR_BODY:.*]]
515+
; COST10: [[VECTOR_BODY]]:
516+
; COST10-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
517+
; COST10-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 3
518+
; COST10-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
519+
; COST10-NEXT: store <48 x i8> zeroinitializer, ptr [[TMP1]], align 1
520+
; COST10-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
521+
; COST10-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
522+
; COST10-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
523+
; COST10: [[MIDDLE_BLOCK]]:
524+
; COST10-NEXT: br i1 false, [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
525+
; COST10: [[VEC_EPILOG_ITER_CHECK]]:
526+
; COST10-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF4]]
527+
; COST10: [[VEC_EPILOG_PH]]:
528+
; COST10-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
529+
; COST10-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
530+
; COST10-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
531+
; COST10-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
532+
; COST10-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
533+
; COST10: [[VEC_EPILOG_VECTOR_BODY]]:
534+
; COST10-NEXT: [[INDEX1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
535+
; COST10-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
536+
; COST10-NEXT: [[TMP3:%.*]] = mul <4 x i64> [[VEC_IND]], splat (i64 3)
537+
; COST10-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
538+
; COST10-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
539+
; COST10-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
540+
; COST10-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
541+
; COST10-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
542+
; COST10-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]]
543+
; COST10-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
544+
; COST10-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
545+
; COST10-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 2
546+
; COST10-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP9]], i64 2
547+
; COST10-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 2
548+
; COST10-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP11]], i64 2
549+
; COST10-NEXT: store i8 0, ptr [[TMP12]], align 1
550+
; COST10-NEXT: store i8 0, ptr [[TMP13]], align 1
551+
; COST10-NEXT: store i8 0, ptr [[TMP14]], align 1
552+
; COST10-NEXT: store i8 0, ptr [[TMP15]], align 1
553+
; COST10-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP8]], i64 1
554+
; COST10-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP9]], i64 1
555+
; COST10-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP10]], i64 1
556+
; COST10-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP11]], i64 1
557+
; COST10-NEXT: store i8 0, ptr [[TMP16]], align 1
558+
; COST10-NEXT: store i8 0, ptr [[TMP17]], align 1
559+
; COST10-NEXT: store i8 0, ptr [[TMP18]], align 1
560+
; COST10-NEXT: store i8 0, ptr [[TMP19]], align 1
561+
; COST10-NEXT: store i8 0, ptr [[TMP8]], align 1
562+
; COST10-NEXT: store i8 0, ptr [[TMP9]], align 1
563+
; COST10-NEXT: store i8 0, ptr [[TMP10]], align 1
564+
; COST10-NEXT: store i8 0, ptr [[TMP11]], align 1
565+
; COST10-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
566+
; COST10-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
567+
; COST10-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 100
568+
; COST10-NEXT: br i1 [[TMP20]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
569+
; COST10: [[VEC_EPILOG_MIDDLE_BLOCK]]:
570+
; COST10-NEXT: br i1 false, [[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
571+
; COST10: [[VEC_EPILOG_SCALAR_PH]]:
572+
;
573+
entry:
574+
br label %loop
575+
576+
loop:
577+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
578+
%iv.3 = mul i64 %iv, 3
579+
%gep.0 = getelementptr i8, ptr %dst, i64 %iv.3
580+
%gep.2 = getelementptr i8, ptr %gep.0, i64 2
581+
store i8 0, ptr %gep.2, align 1
582+
%gep.1 = getelementptr i8, ptr %gep.0, i64 1
583+
store i8 0, ptr %gep.1, align 1
584+
store i8 0, ptr %gep.0, align 1
585+
%iv.next = add i64 %iv, 1
586+
%ec = icmp eq i64 %iv, 100
587+
br i1 %ec, label %exit, label %loop
588+
589+
exit:
590+
ret void
591+
}
592+
436593
attributes #0 = { "target-features"="+neon,+sve" vscale_range(1,16) }
594+
attributes #1 = { "target-cpu"="neoverse-512tvb" }
437595

438596
declare void @llvm.assume(i1 noundef)
439597
declare i64 @llvm.umin.i64(i64, i64)

0 commit comments

Comments
 (0)