@@ -10,11 +10,11 @@ define i32 @dotp_z_s(ptr %a, ptr %b) #0 {
1010; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
1111; CHECK-NEXT: entry:
1212; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
13- ; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP0]], 16
13+ ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
1414; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1515; CHECK: vector.ph:
16- ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
17- ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP14]], 16
16+ ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
17+ ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
1818; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
1919; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
2020; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
@@ -24,21 +24,21 @@ define i32 @dotp_z_s(ptr %a, ptr %b) #0 {
2424; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2525; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
2626; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
27- ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
28- ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
27+ ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
28+ ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
2929; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
3030; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
31- ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP1]], i64 [[TMP9]]
32- ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP2]], align 1
31+ ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
32+ ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
3333; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
3434; CHECK-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
3535; CHECK-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
36- ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
37- ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
36+ ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
37+ ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
3838; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
3939; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8
40- ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP16]]
41- ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
40+ ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
41+ ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
4242; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
4343; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
4444; CHECK-NEXT: [[TMP19:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
@@ -60,48 +60,48 @@ define i32 @dotp_z_s(ptr %a, ptr %b) #0 {
6060; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
6161; CHECK-NOI8MM-NEXT: entry:
6262; CHECK-NOI8MM-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
63- ; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = mul i64 [[TMP0]], 16
63+ ; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
6464; CHECK-NOI8MM-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
6565; CHECK-NOI8MM: vector.ph:
66- ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
67- ; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = mul i64 [[TMP14]], 16
66+ ; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
67+ ; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
6868; CHECK-NOI8MM-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
6969; CHECK-NOI8MM-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
7070; CHECK-NOI8MM-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
7171; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
7272; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]]
7373; CHECK-NOI8MM: vector.body:
7474; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
75- ; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
76- ; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
77- ; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
78- ; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
75+ ; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
76+ ; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
77+ ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
78+ ; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
7979; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
8080; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
81- ; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP1]], i64 [[TMP9]]
82- ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP2]], align 1
81+ ; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
82+ ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
8383; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
8484; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
8585; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
86- ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
87- ; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
86+ ; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
87+ ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
8888; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
8989; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8
90- ; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP16]]
91- ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
90+ ; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
91+ ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
9292; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
9393; CHECK-NOI8MM-NEXT: [[TMP18:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
9494; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
9595; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
9696; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
97- ; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]])
98- ; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]])
97+ ; CHECK-NOI8MM-NEXT: [[TMP22]] = add <vscale x 8 x i32> [[TMP20]], [[VEC_PHI]]
98+ ; CHECK-NOI8MM-NEXT: [[TMP23]] = add <vscale x 8 x i32> [[TMP21]], [[VEC_PHI1]]
9999; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
100100; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
101101; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
102102; CHECK-NOI8MM: middle.block:
103- ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
104- ; CHECK-NOI8MM-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
103+ ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP22]]
104+ ; CHECK-NOI8MM-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
105105; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
106106; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
107107; CHECK-NOI8MM: scalar.ph:
@@ -133,11 +133,11 @@ define i32 @dotp_s_z(ptr %a, ptr %b) #0 {
133133; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
134134; CHECK-NEXT: entry:
135135; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
136- ; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP0]], 16
136+ ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
137137; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
138138; CHECK: vector.ph:
139- ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
140- ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP14]], 16
139+ ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
140+ ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
141141; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
142142; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
143143; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
@@ -147,21 +147,21 @@ define i32 @dotp_s_z(ptr %a, ptr %b) #0 {
147147; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
148148; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
149149; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
150- ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
151- ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
150+ ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
151+ ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
152152; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
153153; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
154- ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP1]], i64 [[TMP9]]
155- ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP2]], align 1
154+ ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
155+ ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
156156; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
157157; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
158158; CHECK-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
159- ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
160- ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
159+ ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
160+ ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
161161; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
162162; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8
163- ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP16]]
164- ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
163+ ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
164+ ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
165165; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
166166; CHECK-NEXT: [[TMP18:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
167167; CHECK-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
@@ -183,48 +183,48 @@ define i32 @dotp_s_z(ptr %a, ptr %b) #0 {
183183; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
184184; CHECK-NOI8MM-NEXT: entry:
185185; CHECK-NOI8MM-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
186- ; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = mul i64 [[TMP0]], 16
186+ ; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
187187; CHECK-NOI8MM-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
188188; CHECK-NOI8MM: vector.ph:
189- ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
190- ; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = mul i64 [[TMP14]], 16
189+ ; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
190+ ; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
191191; CHECK-NOI8MM-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
192192; CHECK-NOI8MM-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
193193; CHECK-NOI8MM-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
194194; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
195195; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]]
196196; CHECK-NOI8MM: vector.body:
197197; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
198- ; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
199- ; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
200- ; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
201- ; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
198+ ; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
199+ ; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
200+ ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
201+ ; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
202202; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
203203; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
204- ; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP1]], i64 [[TMP9]]
205- ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP2]], align 1
204+ ; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
205+ ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
206206; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
207207; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
208208; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
209- ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
210- ; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
209+ ; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
210+ ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
211211; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
212212; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8
213- ; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP16]]
214- ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
213+ ; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
214+ ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
215215; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
216216; CHECK-NOI8MM-NEXT: [[TMP18:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
217217; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
218218; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
219219; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
220- ; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]])
221- ; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]])
220+ ; CHECK-NOI8MM-NEXT: [[TMP22]] = add <vscale x 8 x i32> [[TMP20]], [[VEC_PHI]]
221+ ; CHECK-NOI8MM-NEXT: [[TMP23]] = add <vscale x 8 x i32> [[TMP21]], [[VEC_PHI1]]
222222; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
223223; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
224224; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
225225; CHECK-NOI8MM: middle.block:
226- ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
227- ; CHECK-NOI8MM-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
226+ ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP22]]
227+ ; CHECK-NOI8MM-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
228228; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
229229; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
230230; CHECK-NOI8MM: scalar.ph: