release/21.x: [VPlan] Don't narrow op multiple times in narrowInterleaveGroups. #158013
Conversation
…aveGroups. Track which ops have already been narrowed, to avoid narrowing the same operation multiple times. Repeated narrowing will lead to incorrect results, because we could first narrow from an interleave group -> wide load, and then narrow the wide load -> single-scalar load. Fixes https://github.com/llvm/llvm-project/issues/156190.
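For illustration, below is a minimal, self-contained sketch of the tracking idea the patch uses. It is not LLVM's VPlan API: `Op`, `Kind`, `narrowOnce`, and `narrowOp` are placeholder names standing in for the recipes and the `NarrowOp` lambda in `narrowInterleaveGroups`, and `std::unordered_set` stands in for `SmallPtrSet`.

```cpp
// Sketch only: placeholder types, not LLVM's VPlan classes.
#include <cassert>
#include <iostream>
#include <unordered_set>

enum class Kind { InterleaveLoad, WideLoad, SingleScalarLoad };

struct Op {
  Kind K;
};

// One narrowing step, applied in place for brevity: an interleave load group
// becomes a wide load, and a wide load becomes a single-scalar load.
Op *narrowOnce(Op *O) {
  if (O->K == Kind::InterleaveLoad)
    O->K = Kind::WideLoad;
  else if (O->K == Kind::WideLoad)
    O->K = Kind::SingleScalarLoad;
  return O;
}

// Narrow O at most once: values already produced by a previous narrowing are
// returned unchanged, mirroring the NarrowedOps set added by the patch.
Op *narrowOp(Op *O, std::unordered_set<Op *> &NarrowedOps) {
  if (NarrowedOps.count(O))
    return O;
  Op *N = narrowOnce(O);
  NarrowedOps.insert(N);
  return N;
}

int main() {
  Op Load{Kind::InterleaveLoad};
  std::unordered_set<Op *> NarrowedOps;

  // Two store groups both consume the same load group.
  Op *First = narrowOp(&Load, NarrowedOps);
  Op *Second = narrowOp(&Load, NarrowedOps);

  // With the tracking set, the second use still sees a wide load instead of
  // an over-narrowed single-scalar load.
  assert(First->K == Kind::WideLoad && Second->K == Kind::WideLoad);
  std::cout << "narrowed once, reused for the second store group\n";
  return 0;
}
```

The key point mirrors the patch: each narrowing result is recorded, so when a second store group reaches the same already-narrowed value it is reused rather than narrowed again.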
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms
Author: Florian Hahn (fhahn)
Changes: Track which ops have already been narrowed, to avoid narrowing the same operation multiple times. Repeated narrowing will lead to incorrect results, because we could first narrow from an interleave group -> wide load, and then narrow the wide load -> single-scalar load. Fixes https://github.com/llvm/llvm-project/issues/156190.
Full diff: https://github.com/llvm/llvm-project/pull/158013.diff
3 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6a3b3e6e41955..f7c1c10185c68 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3252,9 +3252,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
return;
// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
- auto NarrowOp = [](VPValue *V) -> VPValue * {
+ SmallPtrSet<VPValue *, 4> NarrowedOps;
+ auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
auto *R = V->getDefiningRecipe();
- if (!R)
+ if (!R || NarrowedOps.contains(V))
return V;
if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
// Narrow interleave group to wide load, as transformed VPlan will only
@@ -3264,6 +3265,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
/*Reverse=*/false, {}, LoadGroup->getDebugLoc());
L->insertBefore(LoadGroup);
+ NarrowedOps.insert(L);
return L;
}
@@ -3271,6 +3273,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
assert(RepR->isSingleScalar() &&
isa<LoadInst>(RepR->getUnderlyingInstr()) &&
"must be a single scalar load");
+ NarrowedOps.insert(RepR);
return RepR;
}
auto *WideLoad = cast<VPWidenLoadRecipe>(R);
@@ -3281,6 +3284,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
WideLoad->operands(), /*IsUniform*/ true,
/*Mask*/ nullptr, *WideLoad);
N->insertBefore(WideLoad);
+ NarrowedOps.insert(N);
return N;
};
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
index 813d61b52100f..aec6c0be6dde2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
@@ -1203,3 +1203,82 @@ loop:
exit:
ret void
}
+
+; Make sure multiple uses of a narrowed op are handled correctly,
+; https://github.com/llvm/llvm-project/issues/156190.
+define void @multiple_store_groups_storing_same_wide_bin_op(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
+; VF2-LABEL: define void @multiple_store_groups_storing_same_wide_bin_op(
+; VF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*:]]
+; VF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
+; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; VF2-NEXT: [[TMP2:%.*]] = fadd contract <2 x double> [[BROADCAST_SPLAT]], splat (double 2.000000e+01)
+; VF2-NEXT: [[TMP3:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
+; VF2-NEXT: store <2 x double> [[TMP2]], ptr [[TMP3]], align 8
+; VF2-NEXT: [[TMP4:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
+; VF2-NEXT: store <2 x double> [[TMP2]], ptr [[TMP4]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+;
+; VF4-LABEL: define void @multiple_store_groups_storing_same_wide_bin_op(
+; VF4-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*:]]
+; VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
+; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x double>, ptr [[TMP0]], align 8
+; VF4-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; VF4-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; VF4-NEXT: [[TMP1:%.*]] = fadd contract <4 x double> [[STRIDED_VEC]], splat (double 2.000000e+01)
+; VF4-NEXT: [[TMP2:%.*]] = fadd contract <4 x double> [[STRIDED_VEC1]], splat (double 2.000000e+01)
+; VF4-NEXT: [[TMP3:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
+; VF4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
+; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.A = getelementptr { double, double }, ptr %A, i64 %iv
+ %l.A.0 = load double, ptr %gep.A, align 8
+ %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A, i64 8
+ %l.A.1 = load double, ptr %gep.A.1, align 8
+ %add.0 = fadd contract double %l.A.0, 20.0
+ %add.1 = fadd contract double %l.A.1, 20.0
+ %gep.B = getelementptr { double, double }, ptr %B, i64 %iv
+ store double %add.0, ptr %gep.B, align 8
+ %gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B, i64 8
+ store double %add.1, ptr %gep.B.1, align 8
+ %gep.C = getelementptr { double, double }, ptr %C, i64 %iv
+ %gep.C.1 = getelementptr inbounds nuw i8, ptr %gep.C, i64 8
+ store double %add.0, ptr %gep.C, align 8
+ store double %add.1, ptr %gep.C.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %.not = icmp eq i64 %iv.next, 1000
+ br i1 %.not, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
index 6acd7989dbfd2..451cbd7601a85 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
@@ -587,3 +587,76 @@ loop:
exit:
ret void
}
+
+define void @multiple_store_groups_storing_same_load_group(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
+; VF2-LABEL: define void @multiple_store_groups_storing_same_load_group(
+; VF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*:]]
+; VF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; VF2-NEXT: [[TMP1:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
+; VF2-NEXT: store <2 x double> [[WIDE_LOAD]], ptr [[TMP1]], align 8
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
+; VF2-NEXT: store <2 x double> [[WIDE_LOAD1]], ptr [[TMP2]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+;
+; VF4-LABEL: define void @multiple_store_groups_storing_same_load_group(
+; VF4-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*:]]
+; VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
+; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x double>, ptr [[TMP0]], align 8
+; VF4-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; VF4-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; VF4-NEXT: [[TMP1:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
+; VF4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[STRIDED_VEC]], <4 x double> [[STRIDED_VEC1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; VF4-NEXT: [[TMP3:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
+; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.A = getelementptr { double, double }, ptr %A, i64 %iv
+ %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A, i64 8
+ %l.A.0 = load double, ptr %gep.A, align 8
+ %l.A.1 = load double, ptr %gep.A.1, align 8
+ %gep.B = getelementptr { double, double }, ptr %B, i64 %iv
+ %gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B, i64 8
+ store double %l.A.0, ptr %gep.B, align 8
+ store double %l.A.1, ptr %gep.B.1, align 8
+ %gep.C = getelementptr { double, double }, ptr %C, i64 %iv
+ %gep.C.1 = getelementptr inbounds nuw i8, ptr %gep.C, i64 8
+ store double %l.A.0, ptr %gep.C, align 8
+ store double %l.A.1, ptr %gep.C.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %.not = icmp eq i64 %iv.next, 1000
+ br i1 %.not, label %exit, label %loop
+
+exit:
+ ret void
+}
Is there someone else who can or should review this?
cc @ayalz / @alexey-bataev |
When I created the PR, the version check failed in CI. Did I do something wrong when creating it?
Nope, that's mainly a reminder to me to bump the version in the release branch. Thanks for checking, though.
Merged: 661c387 |
Track which ops have already been narrowed, to avoid narrowing the same operation multiple times. Repeated narrowing will lead to incorrect results, because we could first narrow from an interleave group -> wide load, and then narrow the wide load -> single-scalar load.
Fixes #156190.