@@ -335,20 +335,20 @@ define i32 @add_i8_i32(ptr nocapture readonly %x, i32 %n) #0 {
335335; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
336336; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
337337; CHECK: vector.ph:
338- ; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 7
339- ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
338+ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 15
339+ ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
340340; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
341341; CHECK: vector.body:
342342; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
343343; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
344- ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
344+ ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
345345; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
346- ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP0]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
347- ; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32>
348- ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP1]], <8 x i32> zeroinitializer
349- ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
346+ ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
347+ ; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
348+ ; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP1]], <16 x i32> zeroinitializer
349+ ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]])
350350; CHECK-NEXT: [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]]
351- ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
351+ ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
352352; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
353353; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
354354; CHECK: for.cond.cleanup:
@@ -1403,21 +1403,21 @@ define i32 @mla_i8_i32_multiuse(ptr nocapture readonly %x, ptr nocapture readonl
14031403; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
14041404; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
14051405; CHECK: vector.ph:
1406- ; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 7
1407- ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
1406+ ; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 15
1407+ ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
14081408; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
14091409; CHECK: vector.body:
14101410; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
14111411; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
1412- ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
1412+ ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
14131413; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
1414- ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP0]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
1415- ; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32>
1416- ; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i32> [[TMP1]], [[TMP1]]
1417- ; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP2]], <8 x i32> zeroinitializer
1418- ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]])
1414+ ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
1415+ ; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
1416+ ; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP1]], [[TMP1]]
1417+ ; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP2]], <16 x i32> zeroinitializer
1418+ ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]])
14191419; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
1420- ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
1420+ ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
14211421; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
14221422; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
14231423; CHECK: for.cond.cleanup:
@@ -1519,25 +1519,25 @@ define i64 @mla_and_add_together_16_64(ptr nocapture noundef readonly %x, i32 no
15191519; CHECK-NEXT: entry:
15201520; CHECK-NEXT: [[CMP16:%.*]] = icmp sgt i32 [[N:%.*]], 0
15211521; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP16]])
1522- ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i32 [[N]], 4
1522+ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i32 [[N]], 8
15231523; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
15241524; CHECK: vector.ph:
1525- ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
1525+ ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483640
15261526; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
15271527; CHECK: vector.body:
15281528; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
15291529; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
15301530; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
15311531; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
1532- ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2
1533- ; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
1534- ; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[TMP1]], [[TMP1]]
1535- ; CHECK-NEXT: [[TMP3:%.*]] = zext nneg <4 x i32> [[TMP2]] to <4 x i64>
1536- ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP3]])
1532+ ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
1533+ ; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
1534+ ; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i32> [[TMP1]], [[TMP1]]
1535+ ; CHECK-NEXT: [[TMP3:%.*]] = zext nneg <8 x i32> [[TMP2]] to <8 x i64>
1536+ ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]])
15371537; CHECK-NEXT: [[TMP5]] = add i64 [[TMP4]], [[VEC_PHI]]
1538- ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
1538+ ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]])
15391539; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI1]]
1540- ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
1540+ ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
15411541; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
15421542; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
15431543; CHECK: middle.block:
0 commit comments