Skip to content

Commit 784fb39

Browse files
committed
[LV] Use forced cost once for whole interleave group in legacy cost model
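The VPlan-based cost model assigns the forced cost (ForceTargetInstructionCost) once for a whole VPInterleaveRecipe, i.e. once per interleave group. Update the legacy cost model to match this behavior: when an instruction's widening decision is CM_Interleave, only the group's insert-position member is assigned the forced cost, and every other member of the group is assigned a cost of 0. This fixes a divergence between the legacy and VPlan-based cost models, and assigns the cost in a way that matches the generated code more accurately.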
1 parent a464e38 commit 784fb39

File tree

2 files changed

+170
-2
lines changed

2 files changed

+170
-2
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5122,8 +5122,18 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
51225122
InstructionCost C = getInstructionCost(&I, VF);
51235123

51245124
// Check if we should override the cost.
5125-
if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5126-
C = InstructionCost(ForceTargetInstructionCost);
5125+
if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) {
5126+
// For interleave groups, use ForceTargetInstructionCost once for the
5127+
// whole group.
5128+
if (VF.isVector() && getWideningDecision(&I, VF) == CM_Interleave) {
5129+
if (getInterleavedAccessGroup(&I)->getInsertPos() == &I)
5130+
C = InstructionCost(ForceTargetInstructionCost);
5131+
else
5132+
C = InstructionCost(0);
5133+
} else {
5134+
C = InstructionCost(ForceTargetInstructionCost);
5135+
}
5136+
}
51275137

51285138
BlockCost += C;
51295139
LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "

llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,165 @@ for.end:
380380
ret void
381381
}
382382

383+
define void @interleave_group(ptr %dst) #1 {
384+
; COST1-LABEL: define void @interleave_group(
385+
; COST1-SAME: ptr [[DST:%.*]]) #[[ATTR1:[0-9]+]] {
386+
; COST1-NEXT: [[ITER_CHECK:.*:]]
387+
; COST1-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
388+
; COST1: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
389+
; COST1-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
390+
; COST1: [[VECTOR_PH]]:
391+
; COST1-NEXT: br label %[[VECTOR_BODY:.*]]
392+
; COST1: [[VECTOR_BODY]]:
393+
; COST1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
394+
; COST1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 16
395+
; COST1-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], 3
396+
; COST1-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0]], 3
397+
; COST1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP1]]
398+
; COST1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
399+
; COST1-NEXT: store <48 x i8> zeroinitializer, ptr [[TMP3]], align 1
400+
; COST1-NEXT: store <48 x i8> zeroinitializer, ptr [[TMP4]], align 1
401+
; COST1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
402+
; COST1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
403+
; COST1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
404+
; COST1: [[MIDDLE_BLOCK]]:
405+
; COST1-NEXT: br i1 false, [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
406+
; COST1: [[VEC_EPILOG_ITER_CHECK]]:
407+
; COST1-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF4]]
408+
; COST1: [[VEC_EPILOG_PH]]:
409+
; COST1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
410+
; COST1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
411+
; COST1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
412+
; COST1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
413+
; COST1-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
414+
; COST1: [[VEC_EPILOG_VECTOR_BODY]]:
415+
; COST1-NEXT: [[INDEX1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
416+
; COST1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
417+
; COST1-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[VEC_IND]], splat (i64 3)
418+
; COST1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
419+
; COST1-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
420+
; COST1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
421+
; COST1-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
422+
; COST1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
423+
; COST1-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
424+
; COST1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP9]]
425+
; COST1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP10]]
426+
; COST1-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP11]], i64 2
427+
; COST1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP12]], i64 2
428+
; COST1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 2
429+
; COST1-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 2
430+
; COST1-NEXT: store i8 0, ptr [[TMP15]], align 1
431+
; COST1-NEXT: store i8 0, ptr [[TMP16]], align 1
432+
; COST1-NEXT: store i8 0, ptr [[TMP17]], align 1
433+
; COST1-NEXT: store i8 0, ptr [[TMP18]], align 1
434+
; COST1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP11]], i64 1
435+
; COST1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP12]], i64 1
436+
; COST1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP13]], i64 1
437+
; COST1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP14]], i64 1
438+
; COST1-NEXT: store i8 0, ptr [[TMP19]], align 1
439+
; COST1-NEXT: store i8 0, ptr [[TMP20]], align 1
440+
; COST1-NEXT: store i8 0, ptr [[TMP21]], align 1
441+
; COST1-NEXT: store i8 0, ptr [[TMP22]], align 1
442+
; COST1-NEXT: store i8 0, ptr [[TMP11]], align 1
443+
; COST1-NEXT: store i8 0, ptr [[TMP12]], align 1
444+
; COST1-NEXT: store i8 0, ptr [[TMP13]], align 1
445+
; COST1-NEXT: store i8 0, ptr [[TMP14]], align 1
446+
; COST1-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
447+
; COST1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
448+
; COST1-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 100
449+
; COST1-NEXT: br i1 [[TMP23]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
450+
; COST1: [[VEC_EPILOG_MIDDLE_BLOCK]]:
451+
; COST1-NEXT: br i1 false, [[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
452+
; COST1: [[VEC_EPILOG_SCALAR_PH]]:
453+
;
454+
; COST10-LABEL: define void @interleave_group(
455+
; COST10-SAME: ptr [[DST:%.*]]) #[[ATTR1:[0-9]+]] {
456+
; COST10-NEXT: [[ITER_CHECK:.*:]]
457+
; COST10-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
458+
; COST10: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
459+
; COST10-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
460+
; COST10: [[VECTOR_PH]]:
461+
; COST10-NEXT: br label %[[VECTOR_BODY:.*]]
462+
; COST10: [[VECTOR_BODY]]:
463+
; COST10-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
464+
; COST10-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 3
465+
; COST10-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
466+
; COST10-NEXT: store <48 x i8> zeroinitializer, ptr [[TMP1]], align 1
467+
; COST10-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
468+
; COST10-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
469+
; COST10-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
470+
; COST10: [[MIDDLE_BLOCK]]:
471+
; COST10-NEXT: br i1 false, [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
472+
; COST10: [[VEC_EPILOG_ITER_CHECK]]:
473+
; COST10-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF4]]
474+
; COST10: [[VEC_EPILOG_PH]]:
475+
; COST10-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
476+
; COST10-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
477+
; COST10-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
478+
; COST10-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
479+
; COST10-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
480+
; COST10: [[VEC_EPILOG_VECTOR_BODY]]:
481+
; COST10-NEXT: [[INDEX1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
482+
; COST10-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
483+
; COST10-NEXT: [[TMP3:%.*]] = mul <4 x i64> [[VEC_IND]], splat (i64 3)
484+
; COST10-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
485+
; COST10-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
486+
; COST10-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
487+
; COST10-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
488+
; COST10-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
489+
; COST10-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]]
490+
; COST10-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
491+
; COST10-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
492+
; COST10-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 2
493+
; COST10-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP9]], i64 2
494+
; COST10-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 2
495+
; COST10-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP11]], i64 2
496+
; COST10-NEXT: store i8 0, ptr [[TMP12]], align 1
497+
; COST10-NEXT: store i8 0, ptr [[TMP13]], align 1
498+
; COST10-NEXT: store i8 0, ptr [[TMP14]], align 1
499+
; COST10-NEXT: store i8 0, ptr [[TMP15]], align 1
500+
; COST10-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP8]], i64 1
501+
; COST10-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP9]], i64 1
502+
; COST10-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP10]], i64 1
503+
; COST10-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP11]], i64 1
504+
; COST10-NEXT: store i8 0, ptr [[TMP16]], align 1
505+
; COST10-NEXT: store i8 0, ptr [[TMP17]], align 1
506+
; COST10-NEXT: store i8 0, ptr [[TMP18]], align 1
507+
; COST10-NEXT: store i8 0, ptr [[TMP19]], align 1
508+
; COST10-NEXT: store i8 0, ptr [[TMP8]], align 1
509+
; COST10-NEXT: store i8 0, ptr [[TMP9]], align 1
510+
; COST10-NEXT: store i8 0, ptr [[TMP10]], align 1
511+
; COST10-NEXT: store i8 0, ptr [[TMP11]], align 1
512+
; COST10-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
513+
; COST10-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
514+
; COST10-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 100
515+
; COST10-NEXT: br i1 [[TMP20]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
516+
; COST10: [[VEC_EPILOG_MIDDLE_BLOCK]]:
517+
; COST10-NEXT: br i1 false, [[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
518+
; COST10: [[VEC_EPILOG_SCALAR_PH]]:
519+
;
520+
entry:
521+
br label %loop
522+
523+
loop:
524+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
525+
%iv.3 = mul i64 %iv, 3
526+
%gep.0 = getelementptr i8, ptr %dst, i64 %iv.3
527+
%gep.2 = getelementptr i8, ptr %gep.0, i64 2
528+
store i8 0, ptr %gep.2, align 1
529+
%gep.1 = getelementptr i8, ptr %gep.0, i64 1
530+
store i8 0, ptr %gep.1, align 1
531+
store i8 0, ptr %gep.0, align 1
532+
%iv.next = add i64 %iv, 1
533+
%ec = icmp eq i64 %iv, 100
534+
br i1 %ec, label %exit, label %loop
535+
536+
exit:
537+
ret void
538+
}
539+
383540
attributes #0 = { "target-features"="+neon,+sve" vscale_range(1,16) }
541+
attributes #1 = { "target-cpu"="neoverse-512tvb" }
384542

385543
declare void @llvm.assume(i1 noundef)
386544
declare i64 @llvm.umin.i64(i64, i64)

0 commit comments

Comments
 (0)