@@ -380,7 +380,165 @@ for.end:
380380 ret void
381381}
382382
; @interleave_group: every loop iteration writes i8 0 to the three consecutive
; bytes starting at %dst + 3 * %iv (offset 2 first, then 1, then 0), so the
; stores of one iteration cover a gap-free 3-byte record. The function uses
; attribute group #1.
;
; Two sets of expected output are checked, under the prefixes COST1 and COST10.
; NOTE(review): the RUN lines that define these prefixes are outside this hunk;
; confirm which options each prefix corresponds to.
;  * COST1:  the main vector body emits two <48 x i8> zeroinitializer stores per
;            iteration and steps the index by 32.
;  * COST10: the main vector body emits one <48 x i8> zeroinitializer store per
;            iteration and steps the index by 16.
; In both cases the main vector loop ends at index 96, and the epilogue vector
; loop uses a <4 x i64> induction with scalar i8 stores and ends at index 100.
383+ define void @interleave_group (ptr %dst ) #1 {
384+ ; COST1-LABEL: define void @interleave_group(
385+ ; COST1-SAME: ptr [[DST:%.*]]) #[[ATTR1:[0-9]+]] {
386+ ; COST1-NEXT: [[ITER_CHECK:.*:]]
387+ ; COST1-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
388+ ; COST1: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
389+ ; COST1-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
390+ ; COST1: [[VECTOR_PH]]:
391+ ; COST1-NEXT: br label %[[VECTOR_BODY:.*]]
392+ ; COST1: [[VECTOR_BODY]]:
393+ ; COST1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
394+ ; COST1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 16
395+ ; COST1-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], 3
396+ ; COST1-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0]], 3
397+ ; COST1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP1]]
398+ ; COST1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
399+ ; COST1-NEXT: store <48 x i8> zeroinitializer, ptr [[TMP3]], align 1
400+ ; COST1-NEXT: store <48 x i8> zeroinitializer, ptr [[TMP4]], align 1
401+ ; COST1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
402+ ; COST1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
403+ ; COST1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
404+ ; COST1: [[MIDDLE_BLOCK]]:
405+ ; COST1-NEXT: br i1 false, [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
406+ ; COST1: [[VEC_EPILOG_ITER_CHECK]]:
407+ ; COST1-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF4]]
408+ ; COST1: [[VEC_EPILOG_PH]]:
409+ ; COST1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
410+ ; COST1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
411+ ; COST1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
412+ ; COST1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
413+ ; COST1-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
414+ ; COST1: [[VEC_EPILOG_VECTOR_BODY]]:
415+ ; COST1-NEXT: [[INDEX1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
416+ ; COST1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
417+ ; COST1-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[VEC_IND]], splat (i64 3)
418+ ; COST1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
419+ ; COST1-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
420+ ; COST1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
421+ ; COST1-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
422+ ; COST1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
423+ ; COST1-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
424+ ; COST1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP9]]
425+ ; COST1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP10]]
426+ ; COST1-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP11]], i64 2
427+ ; COST1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP12]], i64 2
428+ ; COST1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 2
429+ ; COST1-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 2
430+ ; COST1-NEXT: store i8 0, ptr [[TMP15]], align 1
431+ ; COST1-NEXT: store i8 0, ptr [[TMP16]], align 1
432+ ; COST1-NEXT: store i8 0, ptr [[TMP17]], align 1
433+ ; COST1-NEXT: store i8 0, ptr [[TMP18]], align 1
434+ ; COST1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP11]], i64 1
435+ ; COST1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP12]], i64 1
436+ ; COST1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP13]], i64 1
437+ ; COST1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP14]], i64 1
438+ ; COST1-NEXT: store i8 0, ptr [[TMP19]], align 1
439+ ; COST1-NEXT: store i8 0, ptr [[TMP20]], align 1
440+ ; COST1-NEXT: store i8 0, ptr [[TMP21]], align 1
441+ ; COST1-NEXT: store i8 0, ptr [[TMP22]], align 1
442+ ; COST1-NEXT: store i8 0, ptr [[TMP11]], align 1
443+ ; COST1-NEXT: store i8 0, ptr [[TMP12]], align 1
444+ ; COST1-NEXT: store i8 0, ptr [[TMP13]], align 1
445+ ; COST1-NEXT: store i8 0, ptr [[TMP14]], align 1
446+ ; COST1-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
447+ ; COST1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
448+ ; COST1-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 100
449+ ; COST1-NEXT: br i1 [[TMP23]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
450+ ; COST1: [[VEC_EPILOG_MIDDLE_BLOCK]]:
451+ ; COST1-NEXT: br i1 false, [[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
452+ ; COST1: [[VEC_EPILOG_SCALAR_PH]]:
453+ ;
454+ ; COST10-LABEL: define void @interleave_group(
455+ ; COST10-SAME: ptr [[DST:%.*]]) #[[ATTR1:[0-9]+]] {
456+ ; COST10-NEXT: [[ITER_CHECK:.*:]]
457+ ; COST10-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
458+ ; COST10: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
459+ ; COST10-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
460+ ; COST10: [[VECTOR_PH]]:
461+ ; COST10-NEXT: br label %[[VECTOR_BODY:.*]]
462+ ; COST10: [[VECTOR_BODY]]:
463+ ; COST10-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
464+ ; COST10-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 3
465+ ; COST10-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
466+ ; COST10-NEXT: store <48 x i8> zeroinitializer, ptr [[TMP1]], align 1
467+ ; COST10-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
468+ ; COST10-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
469+ ; COST10-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
470+ ; COST10: [[MIDDLE_BLOCK]]:
471+ ; COST10-NEXT: br i1 false, [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
472+ ; COST10: [[VEC_EPILOG_ITER_CHECK]]:
473+ ; COST10-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF4]]
474+ ; COST10: [[VEC_EPILOG_PH]]:
475+ ; COST10-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
476+ ; COST10-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
477+ ; COST10-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
478+ ; COST10-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
479+ ; COST10-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
480+ ; COST10: [[VEC_EPILOG_VECTOR_BODY]]:
481+ ; COST10-NEXT: [[INDEX1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
482+ ; COST10-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
483+ ; COST10-NEXT: [[TMP3:%.*]] = mul <4 x i64> [[VEC_IND]], splat (i64 3)
484+ ; COST10-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
485+ ; COST10-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
486+ ; COST10-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
487+ ; COST10-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
488+ ; COST10-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
489+ ; COST10-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]]
490+ ; COST10-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
491+ ; COST10-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
492+ ; COST10-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 2
493+ ; COST10-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP9]], i64 2
494+ ; COST10-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 2
495+ ; COST10-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP11]], i64 2
496+ ; COST10-NEXT: store i8 0, ptr [[TMP12]], align 1
497+ ; COST10-NEXT: store i8 0, ptr [[TMP13]], align 1
498+ ; COST10-NEXT: store i8 0, ptr [[TMP14]], align 1
499+ ; COST10-NEXT: store i8 0, ptr [[TMP15]], align 1
500+ ; COST10-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP8]], i64 1
501+ ; COST10-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP9]], i64 1
502+ ; COST10-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP10]], i64 1
503+ ; COST10-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP11]], i64 1
504+ ; COST10-NEXT: store i8 0, ptr [[TMP16]], align 1
505+ ; COST10-NEXT: store i8 0, ptr [[TMP17]], align 1
506+ ; COST10-NEXT: store i8 0, ptr [[TMP18]], align 1
507+ ; COST10-NEXT: store i8 0, ptr [[TMP19]], align 1
508+ ; COST10-NEXT: store i8 0, ptr [[TMP8]], align 1
509+ ; COST10-NEXT: store i8 0, ptr [[TMP9]], align 1
510+ ; COST10-NEXT: store i8 0, ptr [[TMP10]], align 1
511+ ; COST10-NEXT: store i8 0, ptr [[TMP11]], align 1
512+ ; COST10-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
513+ ; COST10-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
514+ ; COST10-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 100
515+ ; COST10-NEXT: br i1 [[TMP20]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
516+ ; COST10: [[VEC_EPILOG_MIDDLE_BLOCK]]:
517+ ; COST10-NEXT: br i1 false, [[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
518+ ; COST10: [[VEC_EPILOG_SCALAR_PH]]:
519+ ;
520+ entry:
521+ br label %loop
522+
523+ loop:
524+ %iv = phi i64 [ 0 , %entry ], [ %iv.next , %loop ]
; Byte offset of this iteration's 3-byte record: 3 * %iv.
525+ %iv.3 = mul i64 %iv , 3
526+ %gep.0 = getelementptr i8 , ptr %dst , i64 %iv.3
; The three bytes are written from the highest offset down: +2, then +1, then +0.
527+ %gep.2 = getelementptr i8 , ptr %gep.0 , i64 2
528+ store i8 0 , ptr %gep.2 , align 1
529+ %gep.1 = getelementptr i8 , ptr %gep.0 , i64 1
530+ store i8 0 , ptr %gep.1 , align 1
531+ store i8 0 , ptr %gep.0 , align 1
532+ %iv.next = add i64 %iv , 1
; The exit test compares the pre-increment %iv (not %iv.next) against 100, so
; the body executes for %iv = 0 .. 100 inclusive, i.e. 101 iterations.
533+ %ec = icmp eq i64 %iv , 100
534+ br i1 %ec , label %exit , label %loop
535+
536+ exit:
537+ ret void
538+ }
539+
; Attribute group #0: enables the NEON and SVE target features and bounds
; vscale to the range [1, 16]. Its users are outside this hunk.
383540attributes #0 = { "target-features" ="+neon,+sve" vscale_range(1 ,16 ) }
; Attribute group #1: selects the neoverse-512tvb target CPU; referenced by
; @interleave_group.
541+ attributes #1 = { "target-cpu" ="neoverse-512tvb" }
384542
; Intrinsic declarations. NOTE(review): their callers are not visible in this
; hunk; presumably other tests in this file use them.
385543declare void @llvm.assume (i1 noundef)
386544declare i64 @llvm.umin.i64 (i64 , i64 )
0 commit comments