Skip to content

Commit d998620

Browse files
committed
[RISCV] Mark Sub/AddChainWithSubs as legal reduction types
We used to vectorize these reductions with scalable vectors, but after #147026 Sub and AddChainWithSubs were split out of RecurKind::Add into their own RecurKinds and were never marked as supported in isLegalToVectorizeReduction. As a result, the loop vectorizer dropped the scalable VPlan because it assumed the reductions would be scalarized. This patch fixes that by marking both recurrence kinds as supported. Fixes #154554.
1 parent d590382 commit d998620

File tree

2 files changed

+42
-55
lines changed

2 files changed

+42
-55
lines changed

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,8 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
372372

373373
switch (RdxDesc.getRecurrenceKind()) {
374374
case RecurKind::Add:
375+
case RecurKind::Sub:
376+
case RecurKind::AddChainWithSubs:
375377
case RecurKind::And:
376378
case RecurKind::Or:
377379
case RecurKind::Xor:

llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll

Lines changed: 40 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -64,46 +64,40 @@ for.end: ; preds = %for.body, %entry
6464
define i32 @sub(ptr %a, i64 %n) {
6565
; CHECK-LABEL: define i32 @sub(
6666
; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
67-
; CHECK-NEXT: [[ENTRY:.*]]:
68-
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
69-
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
67+
; CHECK-NEXT: [[ENTRY:.*:]]
68+
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
7069
; CHECK: [[VECTOR_PH]]:
71-
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
72-
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
7370
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
7471
; CHECK: [[VECTOR_BODY]]:
75-
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
76-
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ <i32 1024, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
77-
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
72+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
73+
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 1024, i32 0), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
74+
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
75+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
7876
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]]
79-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[TMP0]], i32 8
80-
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
81-
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 4
82-
; CHECK-NEXT: [[TMP2]] = sub <8 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
83-
; CHECK-NEXT: [[TMP3]] = sub <8 x i32> [[VEC_PHI1]], [[WIDE_LOAD2]]
84-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
85-
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
86-
; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
77+
; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP0]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP1]])
78+
; CHECK-NEXT: [[TMP2:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[VP_OP_LOAD]]
79+
; CHECK-NEXT: [[TMP3]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP1]])
80+
; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64
81+
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP4]], [[INDEX]]
82+
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
83+
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
84+
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
8785
; CHECK: [[MIDDLE_BLOCK]]:
88-
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP3]], [[TMP2]]
89-
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
90-
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
91-
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
86+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP3]])
87+
; CHECK-NEXT: br label %[[EXIT:.*]]
9288
; CHECK: [[SCALAR_PH]]:
93-
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
94-
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ 1024, %[[ENTRY]] ]
9589
; CHECK-NEXT: br label %[[LOOP:.*]]
9690
; CHECK: [[LOOP]]:
97-
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
98-
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUB:%.*]], %[[LOOP]] ]
91+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
92+
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 1024, %[[SCALAR_PH]] ], [ [[SUB:%.*]], %[[LOOP]] ]
9993
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
10094
; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[GEP]], align 4
10195
; CHECK-NEXT: [[SUB]] = sub i32 [[RDX]], [[X]]
10296
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
10397
; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
10498
; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
10599
; CHECK: [[EXIT]]:
106-
; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
100+
; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
107101
; CHECK-NEXT: ret i32 [[SUB_LCSSA]]
108102
;
109103
entry:
@@ -126,44 +120,35 @@ exit:
126120
define i32 @addsub(ptr %a, ptr %b, i64 %n) {
127121
; CHECK-LABEL: define i32 @addsub(
128122
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
129-
; CHECK-NEXT: [[ENTRY:.*]]:
130-
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
131-
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
123+
; CHECK-NEXT: [[ENTRY:.*:]]
124+
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
132125
; CHECK: [[VECTOR_PH]]:
133-
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
134-
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
135126
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
136127
; CHECK: [[VECTOR_BODY]]:
137-
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
138-
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
139-
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
128+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
129+
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
130+
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
131+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
140132
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]]
141-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[TMP0]], i32 8
142-
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
143-
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 4
144-
; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
145-
; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[VEC_PHI1]], [[WIDE_LOAD2]]
133+
; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP0]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP1]])
134+
; CHECK-NEXT: [[TMP2:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[VP_OP_LOAD]]
146135
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
147-
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 8
148-
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
149-
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
150-
; CHECK-NEXT: [[TMP6]] = sub <8 x i32> [[TMP2]], [[WIDE_LOAD3]]
151-
; CHECK-NEXT: [[TMP7]] = sub <8 x i32> [[TMP3]], [[WIDE_LOAD4]]
152-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
153-
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
154-
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
136+
; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP1]])
137+
; CHECK-NEXT: [[TMP9:%.*]] = sub <vscale x 4 x i32> [[TMP2]], [[VP_OP_LOAD1]]
138+
; CHECK-NEXT: [[TMP5]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP9]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP1]])
139+
; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP1]] to i64
140+
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP6]], [[INDEX]]
141+
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP6]]
142+
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
143+
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
155144
; CHECK: [[MIDDLE_BLOCK]]:
156-
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP7]], [[TMP6]]
157-
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
158-
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
159-
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
145+
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP5]])
146+
; CHECK-NEXT: br label %[[EXIT:.*]]
160147
; CHECK: [[SCALAR_PH]]:
161-
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
162-
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
163148
; CHECK-NEXT: br label %[[LOOP:.*]]
164149
; CHECK: [[LOOP]]:
165-
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
166-
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUB:%.*]], %[[LOOP]] ]
150+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
151+
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[SUB:%.*]], %[[LOOP]] ]
167152
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
168153
; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[GEP_A]], align 4
169154
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[RDX]], [[X]]
@@ -174,7 +159,7 @@ define i32 @addsub(ptr %a, ptr %b, i64 %n) {
174159
; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
175160
; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
176161
; CHECK: [[EXIT]]:
177-
; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB]], %[[LOOP]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
162+
; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
178163
; CHECK-NEXT: ret i32 [[SUB_LCSSA]]
179164
;
180165
entry:

0 commit comments

Comments
 (0)