Skip to content

Commit 76ea734

Browse files
committed
improvements on arm
1 parent bab7209 commit 76ea734

File tree

2 files changed

+143
-192
lines changed

2 files changed

+143
-192
lines changed

clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c

Lines changed: 38 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,8 @@ float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
111111
// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
112112
// CHECK-NEXT: [[ENTRY:.*:]]
113113
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[A]], i32 0
114-
// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <1 x double> [[B]], i32 0
115-
// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE4]])
114+
// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <1 x double> [[B]], i32 0
115+
// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE3]])
116116
// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[A]], double [[VMULXD_F64_I]], i32 0
117117
// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
118118
//
@@ -196,19 +196,13 @@ float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
196196
// CHECK-LABEL: define dso_local <1 x double> @test_vfma_lane_f64(
197197
// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
198198
// CHECK-NEXT: [[ENTRY:.*:]]
199-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
200-
// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
201-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
202-
// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
203-
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
204-
// CHECK-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
205-
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
206-
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
207-
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
208-
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
209-
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer
210-
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
211-
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
199+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
200+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to <8 x i8>
201+
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to <8 x i8>
202+
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
203+
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
204+
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
205+
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
212206
// CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
213207
// CHECK-NEXT: ret <1 x double> [[FMLA2]]
214208
//
@@ -219,20 +213,14 @@ float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
219213
// CHECK-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
220214
// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
221215
// CHECK-NEXT: [[ENTRY:.*:]]
222-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
223-
// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
224216
// CHECK-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
225-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64
226-
// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
227-
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
228-
// CHECK-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
229-
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
230-
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
231-
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
232-
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
233-
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer
234-
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
235-
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
217+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
218+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to <8 x i8>
219+
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to <8 x i8>
220+
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
221+
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
222+
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
223+
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
236224
// CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
237225
// CHECK-NEXT: ret <1 x double> [[FMLA2]]
238226
//
@@ -243,21 +231,16 @@ float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
243231
// CHECK-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
244232
// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
245233
// CHECK-NEXT: [[ENTRY:.*:]]
246-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
247-
// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
248-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
249-
// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
250-
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
251-
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
252-
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
253-
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
254-
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
255-
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
256-
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
257-
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
258-
// CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
259-
// CHECK-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
260-
// CHECK-NEXT: ret <1 x double> [[TMP10]]
234+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
235+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to <8 x i8>
236+
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <16 x i8>
237+
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
238+
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
239+
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
240+
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
241+
// CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
242+
// CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
243+
// CHECK-NEXT: ret <1 x double> [[TMP7]]
261244
//
262245
float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
263246
return vfma_laneq_f64(a, b, v, 0);
@@ -266,22 +249,17 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
266249
// CHECK-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
267250
// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
268251
// CHECK-NEXT: [[ENTRY:.*:]]
269-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
270-
// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
271252
// CHECK-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
272-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64
273-
// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
274-
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
275-
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
276-
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
277-
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
278-
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
279-
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
280-
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
281-
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
282-
// CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
283-
// CHECK-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
284-
// CHECK-NEXT: ret <1 x double> [[TMP10]]
253+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
254+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to <8 x i8>
255+
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <16 x i8>
256+
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
257+
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
258+
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
259+
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
260+
// CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
261+
// CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
262+
// CHECK-NEXT: ret <1 x double> [[TMP7]]
285263
//
286264
float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
287265
return vfms_laneq_f64(a, b, v, 0);
@@ -555,8 +533,8 @@ int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
555533
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
556534
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
557535
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
558-
// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
559-
// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE9]])
536+
// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
537+
// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE8]])
560538
// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
561539
// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
562540
//

0 commit comments

Comments
 (0)