Skip to content

Commit 8ea72b3

Browse files
author
Joe Ellis
committed
[clang][AArch64][SVE] Avoid going through memory for coerced VLST return values
VLST return values are coerced to VLATs in the function epilog for consistency with the VLAT ABI. Previously, this coercion was done through memory. It is preferable to use the llvm.experimental.vector.insert intrinsic to avoid going through memory here. Reviewed By: c-rhodes Differential Revision: https://reviews.llvm.org/D94290
1 parent c74751d commit 8ea72b3

File tree

5 files changed

+68
-101
lines changed

5 files changed

+68
-101
lines changed

clang/lib/CodeGen/CGCall.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1265,6 +1265,21 @@ static llvm::Value *CreateCoercedLoad(Address Src, llvm::Type *Ty,
12651265
return CGF.Builder.CreateLoad(Src);
12661266
}
12671267

1268+
// If coercing a fixed vector to a scalable vector for ABI compatibility, and
1269+
// the types match, use the llvm.experimental.vector.insert intrinsic to
1270+
// perform the conversion.
1271+
if (auto *ScalableDst = dyn_cast<llvm::ScalableVectorType>(Ty)) {
1272+
if (auto *FixedSrc = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
1273+
if (ScalableDst->getElementType() == FixedSrc->getElementType()) {
1274+
auto *Load = CGF.Builder.CreateLoad(Src);
1275+
auto *UndefVec = llvm::UndefValue::get(ScalableDst);
1276+
auto *Zero = llvm::Constant::getNullValue(CGF.CGM.Int64Ty);
1277+
return CGF.Builder.CreateInsertVector(ScalableDst, UndefVec, Load, Zero,
1278+
"castScalableSve");
1279+
}
1280+
}
1281+
}
1282+
12681283
// Otherwise do coercion through memory. This is stupid, but simple.
12691284
Address Tmp =
12701285
CreateTempAllocaForCoercion(CGF, Ty, Src.getAlignment(), Src.getName());

clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,11 @@ void test02() {
4848
// CHECK-SAME: [[#VBITS]]
4949
// CHECK-SAME: EES_(<vscale x 4 x i32> %x.coerce, <vscale x 4 x i32> %y.coerce)
5050
// CHECK-NEXT: entry:
51-
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
5251
// CHECK-NEXT: [[X:%.*]] = call <[[#div(VBITS, 32)]] x i32> @llvm.experimental.vector.extract.v[[#div(VBITS, 32)]]i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE:%.*]], i64 0)
5352
// CHECK-NEXT: [[Y:%.*]] = call <[[#div(VBITS, 32)]] x i32> @llvm.experimental.vector.extract.v[[#div(VBITS, 32)]]i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE1:%.*]], i64 0)
5453
// CHECK-NEXT: [[ADD:%.*]] = add <[[#div(VBITS, 32)]] x i32> [[Y]], [[X]]
55-
// CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to <[[#div(VBITS, 32)]] x i32>*
56-
// CHECK-NEXT: store <[[#div(VBITS, 32)]] x i32> [[ADD]], <[[#div(VBITS, 32)]] x i32>* [[RETVAL_0__SROA_CAST]], align 16
57-
// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
58-
// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
54+
// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v[[#div(VBITS, 32)]]i32(<vscale x 4 x i32> undef, <[[#div(VBITS, 32)]] x i32> [[ADD]], i64 0)
55+
// CHECK-NEXT: ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
5956
typedef svint32_t vec __attribute__((arm_sve_vector_bits(N)));
6057
auto f(vec x, vec y) { return x + y; } // Returns a vec.
6158
#endif

clang/test/CodeGen/attr-arm-sve-vector-bits-call.c

Lines changed: 33 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -24,46 +24,37 @@ svint32_t sizeless_callee(svint32_t x) {
2424

2525
// CHECK-LABEL: @fixed_caller(
2626
// CHECK-NEXT: entry:
27-
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
2827
// CHECK-NEXT: [[X:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE:%.*]], i64 0)
2928
// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[X]], i64 0)
3029
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[CASTSCALABLESVE]], i64 0)
31-
// CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to <16 x i32>*
32-
// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16
33-
// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
34-
// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
30+
// CHECK-NEXT: [[CASTSCALABLESVE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[CASTFIXEDSVE]], i64 0)
31+
// CHECK-NEXT: ret <vscale x 4 x i32> [[CASTSCALABLESVE1]]
3532
//
3633
fixed_int32_t fixed_caller(fixed_int32_t x) {
3734
return sizeless_callee(x);
3835
}
3936

4037
// CHECK-LABEL: @fixed_callee(
4138
// CHECK-NEXT: entry:
42-
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
4339
// CHECK-NEXT: [[X:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE:%.*]], i64 0)
44-
// CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to <16 x i32>*
45-
// CHECK-NEXT: store <16 x i32> [[X]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16
46-
// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
47-
// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
40+
// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[X]], i64 0)
41+
// CHECK-NEXT: ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
4842
//
4943
fixed_int32_t fixed_callee(fixed_int32_t x) {
5044
return x;
5145
}
5246

5347
// CHECK-LABEL: @sizeless_caller(
5448
// CHECK-NEXT: entry:
55-
// CHECK-NEXT: [[COERCE_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
5649
// CHECK-NEXT: [[COERCE1:%.*]] = alloca <16 x i32>, align 16
5750
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[X:%.*]], i64 0)
58-
// CHECK-NEXT: [[COERCE_0__SROA_CAST:%.*]] = bitcast <vscale x 4 x i32>* [[COERCE_COERCE]] to <16 x i32>*
59-
// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[COERCE_0__SROA_CAST]], align 16
60-
// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[COERCE_COERCE]], align 16
61-
// CHECK-NEXT: [[CALL:%.*]] = call <vscale x 4 x i32> @fixed_callee(<vscale x 4 x i32> [[TMP0]])
62-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[COERCE1]] to <vscale x 4 x i32>*
63-
// CHECK-NEXT: store <vscale x 4 x i32> [[CALL]], <vscale x 4 x i32>* [[TMP1]], align 16
64-
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* [[COERCE1]], align 16, [[TBAA6:!tbaa !.*]]
65-
// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[TMP2]], i64 0)
66-
// CHECK-NEXT: ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
51+
// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[CASTFIXEDSVE]], i64 0)
52+
// CHECK-NEXT: [[CALL:%.*]] = call <vscale x 4 x i32> @fixed_callee(<vscale x 4 x i32> [[CASTSCALABLESVE]])
53+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i32>* [[COERCE1]] to <vscale x 4 x i32>*
54+
// CHECK-NEXT: store <vscale x 4 x i32> [[CALL]], <vscale x 4 x i32>* [[TMP0]], align 16
55+
// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[COERCE1]], align 16, [[TBAA6:!tbaa !.*]]
56+
// CHECK-NEXT: [[CASTSCALABLESVE2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[TMP1]], i64 0)
57+
// CHECK-NEXT: ret <vscale x 4 x i32> [[CASTSCALABLESVE2]]
6758
//
6859
svint32_t sizeless_caller(svint32_t x) {
6960
return fixed_callee(x);
@@ -75,37 +66,31 @@ svint32_t sizeless_caller(svint32_t x) {
7566

7667
// CHECK-LABEL: @call_int32_ff(
7768
// CHECK-NEXT: entry:
78-
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
79-
// CHECK-NEXT: [[OP1:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE:%.*]], i64 0)
80-
// CHECK-NEXT: [[OP2:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE1:%.*]], i64 0)
69+
// CHECK-NEXT: [[OP1:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[OP1_COERCE:%.*]], i64 0)
70+
// CHECK-NEXT: [[OP2:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[OP2_COERCE:%.*]], i64 0)
8171
// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[OP1]], i64 0)
82-
// CHECK-NEXT: [[CASTSCALABLESVE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[OP2]], i64 0)
72+
// CHECK-NEXT: [[CASTSCALABLESVE2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[OP2]], i64 0)
8373
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
84-
// CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sel.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[CASTSCALABLESVE]], <vscale x 4 x i32> [[CASTSCALABLESVE3]])
74+
// CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sel.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[CASTSCALABLESVE]], <vscale x 4 x i32> [[CASTSCALABLESVE2]])
8575
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TMP1]], i64 0)
86-
// CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to <16 x i32>*
87-
// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16
88-
// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
89-
// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
76+
// CHECK-NEXT: [[CASTSCALABLESVE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[CASTFIXEDSVE]], i64 0)
77+
// CHECK-NEXT: ret <vscale x 4 x i32> [[CASTSCALABLESVE3]]
9078
//
9179
fixed_int32_t call_int32_ff(svbool_t pg, fixed_int32_t op1, fixed_int32_t op2) {
9280
return svsel(pg, op1, op2);
9381
}
9482

9583
// CHECK-LABEL: @call_float64_ff(
9684
// CHECK-NEXT: entry:
97-
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x double>, align 16
98-
// CHECK-NEXT: [[OP1:%.*]] = call <8 x double> @llvm.experimental.vector.extract.v8f64.nxv2f64(<vscale x 2 x double> [[X_COERCE:%.*]], i64 0)
99-
// CHECK-NEXT: [[OP2:%.*]] = call <8 x double> @llvm.experimental.vector.extract.v8f64.nxv2f64(<vscale x 2 x double> [[X_COERCE1:%.*]], i64 0)
85+
// CHECK-NEXT: [[OP1:%.*]] = call <8 x double> @llvm.experimental.vector.extract.v8f64.nxv2f64(<vscale x 2 x double> [[OP1_COERCE:%.*]], i64 0)
86+
// CHECK-NEXT: [[OP2:%.*]] = call <8 x double> @llvm.experimental.vector.extract.v8f64.nxv2f64(<vscale x 2 x double> [[OP2_COERCE:%.*]], i64 0)
10087
// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 2 x double> @llvm.experimental.vector.insert.nxv2f64.v8f64(<vscale x 2 x double> undef, <8 x double> [[OP1]], i64 0)
101-
// CHECK-NEXT: [[CASTSCALABLESVE3:%.*]] = call <vscale x 2 x double> @llvm.experimental.vector.insert.nxv2f64.v8f64(<vscale x 2 x double> undef, <8 x double> [[OP2]], i64 0)
88+
// CHECK-NEXT: [[CASTSCALABLESVE2:%.*]] = call <vscale x 2 x double> @llvm.experimental.vector.insert.nxv2f64.v8f64(<vscale x 2 x double> undef, <8 x double> [[OP2]], i64 0)
10289
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
103-
// CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.sel.nxv2f64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[CASTSCALABLESVE]], <vscale x 2 x double> [[CASTSCALABLESVE3]])
90+
// CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.sel.nxv2f64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[CASTSCALABLESVE]], <vscale x 2 x double> [[CASTSCALABLESVE2]])
10491
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <8 x double> @llvm.experimental.vector.extract.v8f64.nxv2f64(<vscale x 2 x double> [[TMP1]], i64 0)
105-
// CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 2 x double>* [[RETVAL_COERCE]] to <8 x double>*
106-
// CHECK-NEXT: store <8 x double> [[CASTFIXEDSVE]], <8 x double>* [[RETVAL_0__SROA_CAST]], align 16
107-
// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[RETVAL_COERCE]], align 16
108-
// CHECK-NEXT: ret <vscale x 2 x double> [[TMP2]]
92+
// CHECK-NEXT: [[CASTSCALABLESVE3:%.*]] = call <vscale x 2 x double> @llvm.experimental.vector.insert.nxv2f64.v8f64(<vscale x 2 x double> undef, <8 x double> [[CASTFIXEDSVE]], i64 0)
93+
// CHECK-NEXT: ret <vscale x 2 x double> [[CASTSCALABLESVE3]]
10994
//
11095
fixed_float64_t call_float64_ff(svbool_t pg, fixed_float64_t op1, fixed_float64_t op2) {
11196
return svsel(pg, op1, op2);
@@ -150,33 +135,27 @@ fixed_bool_t call_bool_ff(svbool_t pg, fixed_bool_t op1, fixed_bool_t op2) {
150135

151136
// CHECK-LABEL: @call_int32_fs(
152137
// CHECK-NEXT: entry:
153-
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
154-
// CHECK-NEXT: [[OP1:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE:%.*]], i64 0)
138+
// CHECK-NEXT: [[OP1:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[OP1_COERCE:%.*]], i64 0)
155139
// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[OP1]], i64 0)
156140
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
157141
// CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sel.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[CASTSCALABLESVE]], <vscale x 4 x i32> [[OP2:%.*]])
158142
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TMP1]], i64 0)
159-
// CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to <16 x i32>*
160-
// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16
161-
// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
162-
// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
143+
// CHECK-NEXT: [[CASTSCALABLESVE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[CASTFIXEDSVE]], i64 0)
144+
// CHECK-NEXT: ret <vscale x 4 x i32> [[CASTSCALABLESVE1]]
163145
//
164146
fixed_int32_t call_int32_fs(svbool_t pg, fixed_int32_t op1, svint32_t op2) {
165147
return svsel(pg, op1, op2);
166148
}
167149

168150
// CHECK-LABEL: @call_float64_fs(
169151
// CHECK-NEXT: entry:
170-
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x double>, align 16
171-
// CHECK-NEXT: [[OP1:%.*]] = call <8 x double> @llvm.experimental.vector.extract.v8f64.nxv2f64(<vscale x 2 x double> [[X_COERCE:%.*]], i64 0)
152+
// CHECK-NEXT: [[OP1:%.*]] = call <8 x double> @llvm.experimental.vector.extract.v8f64.nxv2f64(<vscale x 2 x double> [[OP1_COERCE:%.*]], i64 0)
172153
// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 2 x double> @llvm.experimental.vector.insert.nxv2f64.v8f64(<vscale x 2 x double> undef, <8 x double> [[OP1]], i64 0)
173154
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
174155
// CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.sel.nxv2f64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[CASTSCALABLESVE]], <vscale x 2 x double> [[OP2:%.*]])
175156
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <8 x double> @llvm.experimental.vector.extract.v8f64.nxv2f64(<vscale x 2 x double> [[TMP1]], i64 0)
176-
// CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 2 x double>* [[RETVAL_COERCE]] to <8 x double>*
177-
// CHECK-NEXT: store <8 x double> [[CASTFIXEDSVE]], <8 x double>* [[RETVAL_0__SROA_CAST]], align 16
178-
// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[RETVAL_COERCE]], align 16
179-
// CHECK-NEXT: ret <vscale x 2 x double> [[TMP2]]
157+
// CHECK-NEXT: [[CASTSCALABLESVE1:%.*]] = call <vscale x 2 x double> @llvm.experimental.vector.insert.nxv2f64.v8f64(<vscale x 2 x double> undef, <8 x double> [[CASTFIXEDSVE]], i64 0)
158+
// CHECK-NEXT: ret <vscale x 2 x double> [[CASTSCALABLESVE1]]
180159
//
181160
fixed_float64_t call_float64_fs(svbool_t pg, fixed_float64_t op1, svfloat64_t op2) {
182161
return svsel(pg, op1, op2);
@@ -213,29 +192,23 @@ fixed_bool_t call_bool_fs(svbool_t pg, fixed_bool_t op1, svbool_t op2) {
213192

214193
// CHECK-LABEL: @call_int32_ss(
215194
// CHECK-NEXT: entry:
216-
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
217195
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
218196
// CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sel.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[OP1:%.*]], <vscale x 4 x i32> [[OP2:%.*]])
219197
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TMP1]], i64 0)
220-
// CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to <16 x i32>*
221-
// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[RETVAL_0__SROA_CAST]], align 16
222-
// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
223-
// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
198+
// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[CASTFIXEDSVE]], i64 0)
199+
// CHECK-NEXT: ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
224200
//
225201
fixed_int32_t call_int32_ss(svbool_t pg, svint32_t op1, svint32_t op2) {
226202
return svsel(pg, op1, op2);
227203
}
228204

229205
// CHECK-LABEL: @call_float64_ss(
230206
// CHECK-NEXT: entry:
231-
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x double>, align 16
232207
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
233208
// CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.sel.nxv2f64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[OP1:%.*]], <vscale x 2 x double> [[OP2:%.*]])
234209
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <8 x double> @llvm.experimental.vector.extract.v8f64.nxv2f64(<vscale x 2 x double> [[TMP1]], i64 0)
235-
// CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 2 x double>* [[RETVAL_COERCE]] to <8 x double>*
236-
// CHECK-NEXT: store <8 x double> [[CASTFIXEDSVE]], <8 x double>* [[RETVAL_0__SROA_CAST]], align 16
237-
// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[RETVAL_COERCE]], align 16
238-
// CHECK-NEXT: ret <vscale x 2 x double> [[TMP2]]
210+
// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 2 x double> @llvm.experimental.vector.insert.nxv2f64.v8f64(<vscale x 2 x double> undef, <8 x double> [[CASTFIXEDSVE]], i64 0)
211+
// CHECK-NEXT: ret <vscale x 2 x double> [[CASTSCALABLESVE]]
239212
//
240213
fixed_float64_t call_float64_ss(svbool_t pg, svfloat64_t op1, svfloat64_t op2) {
241214
return svsel(pg, op1, op2);

0 commit comments

Comments
 (0)