@@ -12,9 +12,9 @@ target triple = "aarch64-unknown-linux-gnu"
12
12
; DEBUG: LV: Found maximum trip count: 19
13
13
; DEBUG: LV: IC is 1
14
14
; DEBUG-VS1: LV: VF is vscale x 16
15
- ; DEBUG-VS1: Main Loop VF:vscale x 16, Main Loop UF:1, Epilogue Loop VF:vscale x 8, Epilogue Loop UF:1
15
+ ; DEBUG-VS1: Main Loop VF:vscale x 16, Main Loop UF:1, Epilogue Loop VF:8, Epilogue Loop UF:1
16
16
; DEBUG-VS2: LV: VF is vscale x 8
17
- ; DEBUG-VS2: Main Loop VF:vscale x 8, Main Loop UF:1, Epilogue Loop VF:vscale x 4, Epilogue Loop UF:1
17
+ ; DEBUG-VS2: Main Loop VF:vscale x 8, Main Loop UF:1, Epilogue Loop VF:8, Epilogue Loop UF:1
18
18
19
19
; DEBUG-LABEL: LV: Checking a loop in 'trip_count_too_small'
20
20
; DEBUG: LV: Found a loop with a very small trip count. This loop is worth vectorizing only if no scalar iteration overheads are incurred.
@@ -48,9 +48,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
48
48
; CHECK-VS1-NEXT: [[TMP1:%.*]] = add i32 [[TC]], 1
49
49
; CHECK-VS1-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
50
50
; CHECK-VS1-NEXT: [[TMP3:%.*]] = sub i64 20, [[TMP2]]
51
- ; CHECK-VS1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
52
- ; CHECK-VS1-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3
53
- ; CHECK-VS1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]]
51
+ ; CHECK-VS1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8
54
52
; CHECK-VS1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
55
53
; CHECK-VS1: [[VECTOR_SCEVCHECK]]:
56
54
; CHECK-VS1-NEXT: [[TMP6:%.*]] = add i32 [[TC]], 1
@@ -91,28 +89,24 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
91
89
; CHECK-VS1: [[VEC_EPILOG_ITER_CHECK]]:
92
90
; CHECK-VS1-NEXT: [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
93
91
; CHECK-VS1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
94
- ; CHECK-VS1-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
95
- ; CHECK-VS1-NEXT: [[TMP27:%.*]] = shl nuw i64 [[TMP26]], 3
96
- ; CHECK-VS1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP27]]
92
+ ; CHECK-VS1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
97
93
; CHECK-VS1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
98
94
; CHECK-VS1: [[VEC_EPILOG_PH]]:
99
95
; CHECK-VS1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
100
- ; CHECK-VS1-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
101
- ; CHECK-VS1-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 8
102
- ; CHECK-VS1-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], [[TMP29]]
96
+ ; CHECK-VS1-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], 8
103
97
; CHECK-VS1-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
104
98
; CHECK-VS1-NEXT: [[TMP39:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
105
- ; CHECK-VS1-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[CONV]], i64 0
106
- ; CHECK-VS1-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT7]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
99
+ ; CHECK-VS1-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <8 x i8> poison, i8 [[CONV]], i64 0
100
+ ; CHECK-VS1-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT4]], <8 x i8> poison, <8 x i32> zeroinitializer
107
101
; CHECK-VS1-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
108
102
; CHECK-VS1: [[VEC_EPILOG_VECTOR_BODY]]:
109
103
; CHECK-VS1-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
110
104
; CHECK-VS1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX5]]
111
105
; CHECK-VS1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[OFFSET_IDX]]
112
- ; CHECK-VS1-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x i8>, ptr [[TMP33]], align 1
113
- ; CHECK-VS1-NEXT: [[TMP35:%.*]] = add <vscale x 8 x i8> [[WIDE_LOAD6]], [[BROADCAST_SPLAT8]]
114
- ; CHECK-VS1-NEXT: store <vscale x 8 x i8> [[TMP35]], ptr [[TMP33]], align 1
115
- ; CHECK-VS1-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], [[TMP29]]
106
+ ; CHECK-VS1-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i8>, ptr [[TMP33]], align 1
107
+ ; CHECK-VS1-NEXT: [[TMP23:%.*]] = add <8 x i8> [[WIDE_LOAD7]], [[BROADCAST_SPLAT5]]
108
+ ; CHECK-VS1-NEXT: store <8 x i8> [[TMP23]], ptr [[TMP33]], align 1
109
+ ; CHECK-VS1-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], 8
116
110
; CHECK-VS1-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC3]]
117
111
; CHECK-VS1-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
118
112
; CHECK-VS1: [[VEC_EPILOG_MIDDLE_BLOCK]]:
@@ -148,9 +142,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
148
142
; CHECK-VS2-NEXT: [[TMP1:%.*]] = add i32 [[TC]], 1
149
143
; CHECK-VS2-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
150
144
; CHECK-VS2-NEXT: [[TMP3:%.*]] = sub i64 20, [[TMP2]]
151
- ; CHECK-VS2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
152
- ; CHECK-VS2-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
153
- ; CHECK-VS2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]]
145
+ ; CHECK-VS2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8
154
146
; CHECK-VS2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
155
147
; CHECK-VS2: [[VECTOR_SCEVCHECK]]:
156
148
; CHECK-VS2-NEXT: [[TMP6:%.*]] = add i32 [[TC]], 1
@@ -191,28 +183,24 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
191
183
; CHECK-VS2: [[VEC_EPILOG_ITER_CHECK]]:
192
184
; CHECK-VS2-NEXT: [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
193
185
; CHECK-VS2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
194
- ; CHECK-VS2-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
195
- ; CHECK-VS2-NEXT: [[TMP27:%.*]] = shl nuw i64 [[TMP26]], 2
196
- ; CHECK-VS2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP27]]
186
+ ; CHECK-VS2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
197
187
; CHECK-VS2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
198
188
; CHECK-VS2: [[VEC_EPILOG_PH]]:
199
189
; CHECK-VS2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
200
- ; CHECK-VS2-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
201
- ; CHECK-VS2-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 4
202
- ; CHECK-VS2-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], [[TMP29]]
190
+ ; CHECK-VS2-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], 8
203
191
; CHECK-VS2-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
204
192
; CHECK-VS2-NEXT: [[TMP39:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
205
- ; CHECK-VS2-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 4 x i8> poison, i8 [[CONV]], i64 0
206
- ; CHECK-VS2-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 4 x i8> [[BROADCAST_SPLATINSERT7]], <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
193
+ ; CHECK-VS2-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <8 x i8> poison, i8 [[CONV]], i64 0
194
+ ; CHECK-VS2-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT4]], <8 x i8> poison, <8 x i32> zeroinitializer
207
195
; CHECK-VS2-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
208
196
; CHECK-VS2: [[VEC_EPILOG_VECTOR_BODY]]:
209
197
; CHECK-VS2-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
210
198
; CHECK-VS2-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX5]]
211
199
; CHECK-VS2-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[OFFSET_IDX]]
212
- ; CHECK-VS2-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i8>, ptr [[TMP33]], align 1
213
- ; CHECK-VS2-NEXT: [[TMP35:%.*]] = add <vscale x 4 x i8> [[WIDE_LOAD6]], [[BROADCAST_SPLAT8]]
214
- ; CHECK-VS2-NEXT: store <vscale x 4 x i8> [[TMP35]], ptr [[TMP33]], align 1
215
- ; CHECK-VS2-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], [[TMP29]]
200
+ ; CHECK-VS2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i8>, ptr [[TMP33]], align 1
201
+ ; CHECK-VS2-NEXT: [[TMP23:%.*]] = add <8 x i8> [[WIDE_LOAD7]], [[BROADCAST_SPLAT5]]
202
+ ; CHECK-VS2-NEXT: store <8 x i8> [[TMP23]], ptr [[TMP33]], align 1
203
+ ; CHECK-VS2-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], 8
216
204
; CHECK-VS2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC3]]
217
205
; CHECK-VS2-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
218
206
; CHECK-VS2: [[VEC_EPILOG_MIDDLE_BLOCK]]:
0 commit comments