@@ -115,20 +115,20 @@ define i64 @add_i16_i64(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK: for.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483640
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP1]])
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
 ; CHECK-NEXT: [[TMP3]] = add i64 [[TMP2]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
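
This hunk narrows the expected vectorization factor for the i16-to-i64 add reduction from 8 to 4: the wide load shrinks from <8 x i16> to <4 x i16>, the induction step drops from 8 to 4, and the trip-count mask changes from 2147483640 (0x7FFFFFF8, rounding N down to a multiple of 8) to 2147483644 (0x7FFFFFFC, a multiple of 4). A minimal C sketch of the loop these checks cover, reconstructed from the IR rather than copied from the test source:

/* Reconstructed C equivalent of the add_i16_i64 loop (an assumption
   inferred from the CHECK lines above, not taken from the test file). */
long long add_i16_i64(const short *x, int n) {
  long long sum = 0;
  for (int i = 0; i < n; i++)
    sum += x[i]; /* each i16 element is sign-extended (sext) to i64 */
  return sum;
}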
@@ -180,20 +180,20 @@ define i64 @add_i8_i64(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK: for.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483632
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP1]])
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
 ; CHECK-NEXT: [[TMP3]] = add i64 [[TMP2]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
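
This hunk makes the matching change for the i8-to-i64 reduction, where the old checks expected a factor of 16 (<16 x i8> loads, mask 2147483632 = 0x7FFFFFF0) and the new ones expect 4. Since the IR widens with zext rather than sext, the element type is effectively unsigned; a reconstructed C sketch under that assumption:

/* Reconstructed C equivalent of the add_i8_i64 loop (an assumption
   inferred from the CHECK lines above; zext implies unsigned elements). */
long long add_i8_i64(const unsigned char *x, int n) {
  long long sum = 0;
  for (int i = 0; i < n; i++)
    sum += x[i]; /* each i8 element is zero-extended (zext) to i64 */
  return sum;
}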