Skip to content

Commit 661c387

Browse files
fhahn
authored and tstellar committed
release/21.x: [VPlan] Don't narrow op multiple times in narrowInterleaveGroups.
Track which ops already have been narrowed, to avoid narrowing the same operation multiple times. Repeated narrowing will lead to incorrect results, because we could first narrow from an interleave group -> wide load, and then narrow the wide load -> single-scalar load. Fixes https://github.com/llvm/llvm-project/issues/156190.
1 parent f5c1b52 commit 661c387

File tree

3 files changed

+158
-2
lines changed

3 files changed

+158
-2
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3252,9 +3252,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
32523252
return;
32533253

32543254
// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
3255-
auto NarrowOp = [](VPValue *V) -> VPValue * {
3255+
SmallPtrSet<VPValue *, 4> NarrowedOps;
3256+
auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
32563257
auto *R = V->getDefiningRecipe();
3257-
if (!R)
3258+
if (!R || NarrowedOps.contains(V))
32583259
return V;
32593260
if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
32603261
// Narrow interleave group to wide load, as transformed VPlan will only
@@ -3264,13 +3265,15 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
32643265
LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
32653266
/*Reverse=*/false, {}, LoadGroup->getDebugLoc());
32663267
L->insertBefore(LoadGroup);
3268+
NarrowedOps.insert(L);
32673269
return L;
32683270
}
32693271

32703272
if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
32713273
assert(RepR->isSingleScalar() &&
32723274
isa<LoadInst>(RepR->getUnderlyingInstr()) &&
32733275
"must be a single scalar load");
3276+
NarrowedOps.insert(RepR);
32743277
return RepR;
32753278
}
32763279
auto *WideLoad = cast<VPWidenLoadRecipe>(R);
@@ -3281,6 +3284,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
32813284
WideLoad->operands(), /*IsUniform*/ true,
32823285
/*Mask*/ nullptr, *WideLoad);
32833286
N->insertBefore(WideLoad);
3287+
NarrowedOps.insert(N);
32843288
return N;
32853289
};
32863290

llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1203,3 +1203,82 @@ loop:
12031203
exit:
12041204
ret void
12051205
}
1206+
1207+
; Make sure multiple uses of a narrowed op are handled correctly,
1208+
; https://github.com/llvm/llvm-project/issues/156190.
1209+
define void @multiple_store_groups_storing_same_wide_bin_op(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
1210+
; VF2-LABEL: define void @multiple_store_groups_storing_same_wide_bin_op(
1211+
; VF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
1212+
; VF2-NEXT: [[ENTRY:.*:]]
1213+
; VF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
1214+
; VF2: [[VECTOR_PH]]:
1215+
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
1216+
; VF2: [[VECTOR_BODY]]:
1217+
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
1218+
; VF2-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
1219+
; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
1220+
; VF2-NEXT: [[TMP2:%.*]] = fadd contract <2 x double> [[BROADCAST_SPLAT]], splat (double 2.000000e+01)
1221+
; VF2-NEXT: [[TMP3:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
1222+
; VF2-NEXT: store <2 x double> [[TMP2]], ptr [[TMP3]], align 8
1223+
; VF2-NEXT: [[TMP4:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
1224+
; VF2-NEXT: store <2 x double> [[TMP2]], ptr [[TMP4]], align 8
1225+
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
1226+
; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
1227+
; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
1228+
; VF2: [[MIDDLE_BLOCK]]:
1229+
; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
1230+
; VF2: [[SCALAR_PH]]:
1231+
;
1232+
; VF4-LABEL: define void @multiple_store_groups_storing_same_wide_bin_op(
1233+
; VF4-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
1234+
; VF4-NEXT: [[ENTRY:.*:]]
1235+
; VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
1236+
; VF4: [[VECTOR_PH]]:
1237+
; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
1238+
; VF4: [[VECTOR_BODY]]:
1239+
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
1240+
; VF4-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
1241+
; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x double>, ptr [[TMP0]], align 8
1242+
; VF4-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1243+
; VF4-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1244+
; VF4-NEXT: [[TMP1:%.*]] = fadd contract <4 x double> [[STRIDED_VEC]], splat (double 2.000000e+01)
1245+
; VF4-NEXT: [[TMP2:%.*]] = fadd contract <4 x double> [[STRIDED_VEC1]], splat (double 2.000000e+01)
1246+
; VF4-NEXT: [[TMP3:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
1247+
; VF4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1248+
; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1249+
; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
1250+
; VF4-NEXT: [[TMP5:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
1251+
; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
1252+
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1253+
; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
1254+
; VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
1255+
; VF4: [[MIDDLE_BLOCK]]:
1256+
; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
1257+
; VF4: [[SCALAR_PH]]:
1258+
;
1259+
entry:
1260+
br label %loop
1261+
1262+
loop:
1263+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
1264+
%gep.A = getelementptr { double, double }, ptr %A, i64 %iv
1265+
%l.A.0 = load double, ptr %gep.A, align 8
1266+
%gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A, i64 8
1267+
%l.A.1 = load double, ptr %gep.A.1, align 8
1268+
%add.0 = fadd contract double %l.A.0, 20.0
1269+
%add.1 = fadd contract double %l.A.1, 20.0
1270+
%gep.B = getelementptr { double, double }, ptr %B, i64 %iv
1271+
store double %add.0, ptr %gep.B, align 8
1272+
%gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B, i64 8
1273+
store double %add.1, ptr %gep.B.1, align 8
1274+
%gep.C = getelementptr { double, double }, ptr %C, i64 %iv
1275+
%gep.C.1 = getelementptr inbounds nuw i8, ptr %gep.C, i64 8
1276+
store double %add.0, ptr %gep.C, align 8
1277+
store double %add.1, ptr %gep.C.1, align 8
1278+
%iv.next = add nuw nsw i64 %iv, 1
1279+
%.not = icmp eq i64 %iv.next, 1000
1280+
br i1 %.not, label %exit, label %loop
1281+
1282+
exit:
1283+
ret void
1284+
}

llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -587,3 +587,76 @@ loop:
587587
exit:
588588
ret void
589589
}
590+
591+
define void @multiple_store_groups_storing_same_load_group(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
592+
; VF2-LABEL: define void @multiple_store_groups_storing_same_load_group(
593+
; VF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
594+
; VF2-NEXT: [[ENTRY:.*:]]
595+
; VF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
596+
; VF2: [[VECTOR_PH]]:
597+
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
598+
; VF2: [[VECTOR_BODY]]:
599+
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
600+
; VF2-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
601+
; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
602+
; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
603+
; VF2-NEXT: [[TMP1:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
604+
; VF2-NEXT: store <2 x double> [[WIDE_LOAD]], ptr [[TMP1]], align 8
605+
; VF2-NEXT: [[TMP2:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
606+
; VF2-NEXT: store <2 x double> [[WIDE_LOAD1]], ptr [[TMP2]], align 8
607+
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
608+
; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
609+
; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
610+
; VF2: [[MIDDLE_BLOCK]]:
611+
; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
612+
; VF2: [[SCALAR_PH]]:
613+
;
614+
; VF4-LABEL: define void @multiple_store_groups_storing_same_load_group(
615+
; VF4-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
616+
; VF4-NEXT: [[ENTRY:.*:]]
617+
; VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
618+
; VF4: [[VECTOR_PH]]:
619+
; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
620+
; VF4: [[VECTOR_BODY]]:
621+
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
622+
; VF4-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
623+
; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x double>, ptr [[TMP0]], align 8
624+
; VF4-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
625+
; VF4-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
626+
; VF4-NEXT: [[TMP1:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
627+
; VF4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[STRIDED_VEC]], <4 x double> [[STRIDED_VEC1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
628+
; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
629+
; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
630+
; VF4-NEXT: [[TMP3:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
631+
; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
632+
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
633+
; VF4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
634+
; VF4-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
635+
; VF4: [[MIDDLE_BLOCK]]:
636+
; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
637+
; VF4: [[SCALAR_PH]]:
638+
;
639+
entry:
640+
br label %loop
641+
642+
loop:
643+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
644+
%gep.A = getelementptr { double, double }, ptr %A, i64 %iv
645+
%gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A, i64 8
646+
%l.A.0 = load double, ptr %gep.A, align 8
647+
%l.A.1 = load double, ptr %gep.A.1, align 8
648+
%gep.B = getelementptr { double, double }, ptr %B, i64 %iv
649+
%gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B, i64 8
650+
store double %l.A.0, ptr %gep.B, align 8
651+
store double %l.A.1, ptr %gep.B.1, align 8
652+
%gep.C = getelementptr { double, double }, ptr %C, i64 %iv
653+
%gep.C.1 = getelementptr inbounds nuw i8, ptr %gep.C, i64 8
654+
store double %l.A.0, ptr %gep.C, align 8
655+
store double %l.A.1, ptr %gep.C.1, align 8
656+
%iv.next = add nuw nsw i64 %iv, 1
657+
%.not = icmp eq i64 %iv.next, 1000
658+
br i1 %.not, label %exit, label %loop
659+
660+
exit:
661+
ret void
662+
}

0 commit comments

Comments
 (0)