Skip to content

Commit 9c0e09e

Browse files
committed
[VPlan] Process ExpressionRecipes in reverse order in constructor.
Currently there's a crash when trying to construct a VPExpressionRecipe for a mul (ext, ext) if the multiply has users outside the expression: the mul is cloned to serve its external users, but the extends are not cloned and stay connected to a user outside the expression (the cloned multiply). To fix this, process the recipes in reverse order. This ensures that we visit bundled users before their operands, so that the extends feeding the external user are cloned as well.
1 parent 58ce3e2 commit 9c0e09e

File tree

2 files changed

+122
-1
lines changed

2 files changed

+122
-1
lines changed

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2777,7 +2777,7 @@ VPExpressionRecipe::VPExpressionRecipe(
27772777
// Recipes in the expression, except the last one, must only be used by
27782778
// (other) recipes inside the expression. If there are other users, external
27792779
// to the expression, use a clone of the recipe for external users.
2780-
for (VPSingleDefRecipe *R : ExpressionRecipes) {
2780+
for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
27812781
if (R != ExpressionRecipes.back() &&
27822782
any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
27832783
return !ExpressionRecipesAsSetOfUsers.contains(U);
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
2+
; RUN: opt -p loop-vectorize -prefer-inloop-reductions -mcpu=apple-m1 -force-vector-interleave=1 -S %s | FileCheck %s
3+
4+
target triple = "arm64-apple-macosx"
5+
6+
define i32 @mul_used_outside_vpexpression(ptr %src.0, ptr %src.1) {
7+
; CHECK-LABEL: define i32 @mul_used_outside_vpexpression(
8+
; CHECK-SAME: ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]]) #[[ATTR0:[0-9]+]] {
9+
; CHECK-NEXT: [[ITER_CHECK:.*]]:
10+
; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
11+
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
12+
; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
13+
; CHECK: [[VECTOR_PH]]:
14+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC_1]], i64 1
15+
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
16+
; CHECK: [[VECTOR_BODY]]:
17+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
18+
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
19+
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
20+
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC_0]], i64 [[INDEX]]
21+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
22+
; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[TMP0]], align 1
23+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[TMP1]], i64 0
24+
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
25+
; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
26+
; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
27+
; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP2]], [[TMP3]]
28+
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
29+
; CHECK-NEXT: [[TMP6]] = add i32 [[VEC_PHI]], [[TMP5]]
30+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP4]])
31+
; CHECK-NEXT: [[TMP8]] = or i32 [[VEC_PHI1]], [[TMP7]]
32+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
33+
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
34+
; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
35+
; CHECK: [[MIDDLE_BLOCK]]:
36+
; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
37+
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
38+
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[SRC_0]], i64 96
39+
; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
40+
; CHECK: [[VEC_EPILOG_PH]]:
41+
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
42+
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
43+
; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i32 [ [[TMP8]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
44+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[SRC_0]], i64 100
45+
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[SRC_1]], i64 1
46+
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
47+
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
48+
; CHECK-NEXT: [[INDEX3:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
49+
; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
50+
; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi i32 [ [[BC_MERGE_RDX2]], %[[VEC_EPILOG_PH]] ], [ [[TMP19:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
51+
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[SRC_0]], i64 [[INDEX3]]
52+
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[NEXT_GEP6]], align 1
53+
; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP11]], align 1
54+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0
55+
; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT8]], <4 x i8> poison, <4 x i32> zeroinitializer
56+
; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[WIDE_LOAD7]] to <4 x i32>
57+
; CHECK-NEXT: [[TMP14:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT9]] to <4 x i32>
58+
; CHECK-NEXT: [[TMP15:%.*]] = mul <4 x i32> [[TMP13]], [[TMP14]]
59+
; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
60+
; CHECK-NEXT: [[TMP17]] = add i32 [[VEC_PHI4]], [[TMP16]]
61+
; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP15]])
62+
; CHECK-NEXT: [[TMP19]] = or i32 [[VEC_PHI5]], [[TMP18]]
63+
; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX3]], 4
64+
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT10]], 100
65+
; CHECK-NEXT: br i1 [[TMP20]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
66+
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
67+
; CHECK-NEXT: br i1 false, label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
68+
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
69+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 100, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
70+
; CHECK-NEXT: [[BC_RESUME_VAL11:%.*]] = phi ptr [ [[TMP10]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[SRC_0]], %[[ITER_CHECK]] ]
71+
; CHECK-NEXT: [[BC_MERGE_RDX12:%.*]] = phi i32 [ [[TMP17]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP6]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
72+
; CHECK-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP19]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP8]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
73+
; CHECK-NEXT: br label %[[LOOP:.*]]
74+
; CHECK: [[LOOP]]:
75+
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
76+
; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL11]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[GEP_0:%.*]], %[[LOOP]] ]
77+
; CHECK-NEXT: [[RED_0:%.*]] = phi i32 [ [[BC_MERGE_RDX12]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_0_NEXT:%.*]], %[[LOOP]] ]
78+
; CHECK-NEXT: [[RED_1:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_1_NEXT:%.*]], %[[LOOP]] ]
79+
; CHECK-NEXT: [[GEP_0]] = getelementptr i8, ptr [[PTR_IV]], i64 1
80+
; CHECK-NEXT: [[L_0:%.*]] = load i8, ptr [[PTR_IV]], align 1
81+
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[SRC_1]], i64 1
82+
; CHECK-NEXT: [[L_1:%.*]] = load i8, ptr [[GEP_1]], align 1
83+
; CHECK-NEXT: [[L_0_EXT:%.*]] = zext i8 [[L_0]] to i32
84+
; CHECK-NEXT: [[L_1_EXT:%.*]] = zext i8 [[L_1]] to i32
85+
; CHECK-NEXT: [[MUL_EXT_LL:%.*]] = mul i32 [[L_0_EXT]], [[L_1_EXT]]
86+
; CHECK-NEXT: [[RED_1_NEXT]] = or i32 [[MUL_EXT_LL]], [[RED_1]]
87+
; CHECK-NEXT: [[RED_0_NEXT]] = add i32 [[MUL_EXT_LL]], [[RED_0]]
88+
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
89+
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 101
90+
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
91+
; CHECK: [[EXIT]]:
92+
; CHECK-NEXT: [[RED_1_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_1_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[TMP19]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
93+
; CHECK-NEXT: [[RED_0_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_0_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ [[TMP17]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
94+
; CHECK-NEXT: [[RES:%.*]] = add i32 [[RED_1_NEXT_LCSSA]], [[RED_0_NEXT_LCSSA]]
95+
; CHECK-NEXT: ret i32 [[RES]]
96+
;
97+
entry:
98+
br label %loop
99+
100+
loop:
101+
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
102+
%ptr.iv = phi ptr [ %src.0, %entry ], [ %gep.0, %loop ]
103+
%red.0 = phi i32 [ 0, %entry ], [ %red.0.next, %loop ]
104+
%red.1 = phi i32 [ 0, %entry ], [ %red.1.next, %loop ]
105+
%gep.0 = getelementptr i8, ptr %ptr.iv, i64 1
106+
%l.0 = load i8, ptr %ptr.iv, align 1
107+
%gep.1 = getelementptr i8, ptr %src.1, i64 1
108+
%l.1 = load i8, ptr %gep.1, align 1
109+
%l.0.ext = zext i8 %l.0 to i32
110+
%l.1.ext = zext i8 %l.1 to i32
111+
%mul.ext.ll = mul i32 %l.0.ext, %l.1.ext
112+
%red.1.next = or i32 %mul.ext.ll, %red.1
113+
%red.0.next = add i32 %mul.ext.ll, %red.0
114+
%iv.next = add i32 %iv, 1
115+
%ec = icmp eq i32 %iv, 101
116+
br i1 %ec, label %exit, label %loop
117+
118+
exit:
119+
%res = add i32 %red.1.next, %red.0.next
120+
ret i32 %res
121+
}

0 commit comments

Comments
 (0)