From 9dede331a3a8b60dc2b93ae684f1c5b6154e4a2d Mon Sep 17 00:00:00 2001 From: Paul Osmialowski Date: Mon, 23 Jun 2025 16:11:57 +0000 Subject: [PATCH 1/2] Revert "[Downstream change][LLVM][SROA] Teach SROA how to "bitcast" between fixed and scalable vectors (#219)" This reverts commit e99c0aaa74750ab510704ed8922c577e7d270fa4. --- .../attr-riscv-rvv-vector-bits-less-8-call.c | 38 +-- .../attr-riscv-rvv-vector-bits-less-8-cast.c | 8 +- .../CodeGen/RISCV/attr-rvv-vector-bits-cast.c | 16 +- .../CodeGen/attr-arm-sve-vector-bits-cast.c | 23 +- llvm/include/llvm/IR/Attributes.h | 4 - llvm/include/llvm/IR/DerivedTypes.h | 16 -- llvm/lib/IR/AttributeImpl.h | 1 - llvm/lib/IR/Attributes.cpp | 8 - llvm/lib/Transforms/Scalar/SROA.cpp | 130 +++------ .../scalable-vectors-with-known-vscale.ll | 248 ------------------ llvm/test/Transforms/SROA/scalable-vectors.ll | 142 ---------- 11 files changed, 71 insertions(+), 563 deletions(-) delete mode 100644 llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll diff --git a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c index 66fd466eccfe..e2f02dc64f76 100644 --- a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c +++ b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c @@ -26,15 +26,11 @@ typedef vbool64_t fixed_bool64_t __attribute__((riscv_rvv_vector_bits(__riscv_v_ // // CHECK-128-LABEL: @call_bool32_ff( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1 -// CHECK-128-NEXT: [[SAVED_VALUE3:%.*]] = alloca <1 x i8>, align 1 // CHECK-128-NEXT: [[SAVED_VALUE4:%.*]] = alloca , align 1 // CHECK-128-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_:%.*]] = load , ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6:![0-9]+]] -// CHECK-128-NEXT: [[SAVED_VALUE3_0_SAVED_VALUE3_0_SAVED_VALUE3_0_SAVED_VALUE3_0_:%.*]] = load , ptr [[SAVED_VALUE3]], align 1, !tbaa [[TBAA6]] -// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv2i1.i64( [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_]], [[SAVED_VALUE3_0_SAVED_VALUE3_0_SAVED_VALUE3_0_SAVED_VALUE3_0_]], i64 4) -// CHECK-128-NEXT: store [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA9:![0-9]+]] -// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA6]] +// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv2i1.i64( [[OP1_COERCE:%.*]], [[OP2_COERCE:%.*]], i64 4) +// CHECK-128-NEXT: store [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA6:![0-9]+]] +// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA10:![0-9]+]] // CHECK-128-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 // CHECK-128-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 // CHECK-128-NEXT: ret [[TMP2]] @@ -56,15 +52,11 @@ fixed_bool32_t call_bool32_ff(fixed_bool32_t op1, fixed_bool32_t op2) { // // CHECK-128-LABEL: @call_bool64_ff( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1 -// CHECK-128-NEXT: [[SAVED_VALUE3:%.*]] = alloca <1 x i8>, align 1 // CHECK-128-NEXT: [[SAVED_VALUE4:%.*]] = alloca , align 1 // CHECK-128-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_:%.*]] = load , ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] -// CHECK-128-NEXT: 
[[SAVED_VALUE3_0_SAVED_VALUE3_0_SAVED_VALUE3_0_SAVED_VALUE3_0_:%.*]] = load , ptr [[SAVED_VALUE3]], align 1, !tbaa [[TBAA6]] -// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv1i1.i64( [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_]], [[SAVED_VALUE3_0_SAVED_VALUE3_0_SAVED_VALUE3_0_SAVED_VALUE3_0_]], i64 2) +// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv1i1.i64( [[OP1_COERCE:%.*]], [[OP2_COERCE:%.*]], i64 2) // CHECK-128-NEXT: store [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA11:![0-9]+]] -// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA6]] +// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA10]] // CHECK-128-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 // CHECK-128-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 // CHECK-128-NEXT: ret [[TMP2]] @@ -90,13 +82,11 @@ fixed_bool64_t call_bool64_ff(fixed_bool64_t op1, fixed_bool64_t op2) { // // CHECK-128-LABEL: @call_bool32_fs( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1 // CHECK-128-NEXT: [[SAVED_VALUE2:%.*]] = alloca , align 1 // CHECK-128-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_:%.*]] = load , ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] -// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv2i1.i64( [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_]], [[OP2:%.*]], i64 4) -// CHECK-128-NEXT: store [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA9]] -// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA6]] +// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv2i1.i64( [[OP1_COERCE:%.*]], [[OP2:%.*]], i64 4) +// CHECK-128-NEXT: store [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA6]] +// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA10]] // CHECK-128-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 // CHECK-128-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 // CHECK-128-NEXT: ret [[TMP2]] @@ -118,13 +108,11 @@ fixed_bool32_t call_bool32_fs(fixed_bool32_t op1, vbool32_t op2) { // // CHECK-128-LABEL: @call_bool64_fs( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1 // CHECK-128-NEXT: [[SAVED_VALUE2:%.*]] = alloca , align 1 // CHECK-128-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_:%.*]] = load , ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] -// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv1i1.i64( [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_]], [[OP2:%.*]], i64 2) +// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv1i1.i64( [[OP1_COERCE:%.*]], [[OP2:%.*]], i64 2) // CHECK-128-NEXT: store [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA11]] -// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA6]] +// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA10]] // CHECK-128-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 // CHECK-128-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 // CHECK-128-NEXT: ret [[TMP2]] @@ -153,8 +141,8 @@ fixed_bool64_t call_bool64_fs(fixed_bool64_t op1, vbool64_t op2) { // CHECK-128-NEXT: [[SAVED_VALUE:%.*]] 
= alloca , align 1 // CHECK-128-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 // CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv2i1.i64( [[OP1:%.*]], [[OP2:%.*]], i64 4) -// CHECK-128-NEXT: store [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA9]] -// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] +// CHECK-128-NEXT: store [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] +// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]] // CHECK-128-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 // CHECK-128-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 // CHECK-128-NEXT: ret [[TMP2]] @@ -180,7 +168,7 @@ fixed_bool32_t call_bool32_ss(vbool32_t op1, vbool32_t op2) { // CHECK-128-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 // CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv1i1.i64( [[OP1:%.*]], [[OP2:%.*]], i64 2) // CHECK-128-NEXT: store [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11]] -// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] +// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]] // CHECK-128-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 // CHECK-128-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 // CHECK-128-NEXT: ret [[TMP2]] diff --git a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c index 3785036380f5..f0fa7e8d07b4 100644 --- a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c +++ b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c @@ -80,9 +80,7 @@ fixed_bool32_t from_vbool32_t(vbool32_t type) { // // CHECK-128-LABEL: @to_vbool32_t( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1 -// CHECK-128-NEXT: [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_:%.*]] = load , ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]] -// CHECK-128-NEXT: ret [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_]] +// CHECK-128-NEXT: ret [[TYPE_COERCE:%.*]] // vbool32_t to_vbool32_t(fixed_bool32_t type) { return type; @@ -118,9 +116,7 @@ fixed_bool64_t from_vbool64_t(vbool64_t type) { // // CHECK-128-LABEL: @to_vbool64_t( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1 -// CHECK-128-NEXT: [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_:%.*]] = load , ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]] -// CHECK-128-NEXT: ret [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_]] +// CHECK-128-NEXT: ret [[TYPE_COERCE:%.*]] // vbool64_t to_vbool64_t(fixed_bool64_t type) { return type; diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c index 8764616eef23..7992951346d5 100644 --- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c +++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c @@ -99,8 +99,8 @@ vbool4_t to_vbool4_t(fixed_bool4_t type) { // CHECK-NEXT: entry: // CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 // CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-NEXT: store [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10:![0-9]+]] +// CHECK-NEXT: store [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa 
[[TBAA4:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA8:![0-9]+]] // CHECK-NEXT: store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1 // CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 // CHECK-NEXT: ret [[TMP1]] @@ -111,9 +111,7 @@ fixed_bool32_t from_vbool32_t(vbool32_t type) { // CHECK-LABEL: @to_vbool32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1 -// CHECK-NEXT: [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_:%.*]] = load , ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]] -// CHECK-NEXT: ret [[SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_SAVED_VALUE_0_]] +// CHECK-NEXT: ret [[TYPE_COERCE:%.*]] // vbool32_t to_vbool32_t(fixed_bool32_t type) { return type; @@ -121,7 +119,7 @@ vbool32_t to_vbool32_t(fixed_bool32_t type) { // CHECK-LABEL: @to_vint32m1_t__from_gnu_int32m1_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA10]] +// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]] // CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i32.v8i32( poison, <8 x i32> [[TYPE]], i64 0) // CHECK-NEXT: ret [[CAST_SCALABLE]] // @@ -132,7 +130,7 @@ vint32m1_t to_vint32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) { // CHECK-LABEL: @from_vint32m1_t__to_gnu_int32m1_t( // CHECK-NEXT: entry: // CHECK-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32( [[TYPE:%.*]], i64 0) -// CHECK-NEXT: store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA10]] +// CHECK-NEXT: store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) { @@ -141,7 +139,7 @@ gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) { // CHECK-LABEL: @to_fixed_int32m1_t__from_gnu_int32m1_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA10]] +// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]] // CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i32.v8i32( poison, <8 x i32> [[TYPE]], i64 0) // CHECK-NEXT: ret [[CAST_SCALABLE]] // @@ -152,7 +150,7 @@ fixed_int32m1_t to_fixed_int32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) { // CHECK-LABEL: @from_fixed_int32m1_t__to_gnu_int32m1_t( // CHECK-NEXT: entry: // CHECK-NEXT: [[TYPE:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32( [[TYPE_COERCE:%.*]], i64 0) -// CHECK-NEXT: store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA10]] +// CHECK-NEXT: store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // gnu_int32m1_t from_fixed_int32m1_t__to_gnu_int32m1_t(fixed_int32m1_t type) { diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c index fcd4314249ff..e1e2220f94d6 100644 --- a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c +++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c @@ -62,7 +62,10 @@ fixed_bool_t from_svbool_t(svbool_t type) { // CHECK-LABEL: @lax_cast( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[TYPE_COERCE:%.*]] to +// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <16 x i32>, align 64 +// CHECK-NEXT: [[TYPE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE_COERCE:%.*]], i64 0) +// 
CHECK-NEXT: store <16 x i32> [[TYPE]], ptr [[SAVED_VALUE]], align 64, !tbaa [[TBAA6:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load , ptr [[SAVED_VALUE]], align 64, !tbaa [[TBAA6]] // CHECK-NEXT: ret [[TMP0]] // svint64_t lax_cast(fixed_int32_t type) { @@ -71,9 +74,9 @@ svint64_t lax_cast(fixed_int32_t type) { // CHECK-LABEL: @to_svint32_t__from_gnu_int32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA2:![0-9]+]] -// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) -// CHECK-NEXT: ret [[CAST_SCALABLE]] +// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA6]] +// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) +// CHECK-NEXT: ret [[CASTSCALABLESVE]] // svint32_t to_svint32_t__from_gnu_int32_t(gnu_int32_t type) { return type; @@ -81,8 +84,8 @@ svint32_t to_svint32_t__from_gnu_int32_t(gnu_int32_t type) { // CHECK-LABEL: @from_svint32_t__to_gnu_int32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[CAST_FIXED:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE:%.*]], i64 0) -// CHECK-NEXT: store <16 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE:%.*]], i64 0) +// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA6]] // CHECK-NEXT: ret void // gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) { @@ -91,9 +94,9 @@ gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) { // CHECK-LABEL: @to_fixed_int32_t__from_gnu_int32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) -// CHECK-NEXT: ret [[CAST_SCALABLE]] +// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA6]] +// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) +// CHECK-NEXT: ret [[CASTSCALABLESVE]] // fixed_int32_t to_fixed_int32_t__from_gnu_int32_t(gnu_int32_t type) { return type; @@ -102,7 +105,7 @@ fixed_int32_t to_fixed_int32_t__from_gnu_int32_t(gnu_int32_t type) { // CHECK-LABEL: @from_fixed_int32_t__to_gnu_int32_t( // CHECK-NEXT: entry: // CHECK-NEXT: [[TYPE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE_COERCE:%.*]], i64 0) -// CHECK-NEXT: store <16 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <16 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA6]] // CHECK-NEXT: ret void // gnu_int32_t from_fixed_int32_t__to_gnu_int32_t(fixed_int32_t type) { diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index 610f3cfca41c..7612e553fe32 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -275,10 +275,6 @@ class Attribute { /// when unknown. std::optional getVScaleRangeMax() const; - /// Return the value for vscale based on the vscale_range attribute or 0 when - /// unknown. - unsigned getVScaleValue() const; - // Returns the unwind table kind. 
UWTableKind getUWTableKind() const; diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index 15370cb0c95c..b44f4f8c8687 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -545,22 +545,6 @@ class VectorType : public Type { return VectorType::get(VTy->getElementType(), EltCnt * 2); } - /// This static method returns a VectorType with the same size-in-bits as - /// SizeTy but with an element type that matches the scalar type of EltTy. - static VectorType *getWithSizeAndScalar(VectorType *SizeTy, Type *EltTy) { - if (SizeTy->getScalarType() == EltTy->getScalarType()) - return SizeTy; - - unsigned EltSize = EltTy->getScalarSizeInBits(); - if (!SizeTy->getPrimitiveSizeInBits().isKnownMultipleOf(EltSize)) - return nullptr; - - ElementCount EC = SizeTy->getElementCount() - .multiplyCoefficientBy(SizeTy->getScalarSizeInBits()) - .divideCoefficientBy(EltSize); - return VectorType::get(EltTy->getScalarType(), EC); - } - /// Return true if the specified type is valid as a element type. static bool isValidElementType(Type *ElemTy); diff --git a/llvm/lib/IR/AttributeImpl.h b/llvm/lib/IR/AttributeImpl.h index 42a5ab4c58f9..59cc489ade40 100644 --- a/llvm/lib/IR/AttributeImpl.h +++ b/llvm/lib/IR/AttributeImpl.h @@ -343,7 +343,6 @@ class AttributeSetNode final const; unsigned getVScaleRangeMin() const; std::optional getVScaleRangeMax() const; - unsigned getVScaleValue() const; UWTableKind getUWTableKind() const; AllocFnKind getAllocKind() const; MemoryEffects getMemoryEffects() const; diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index 7c3a554ba03f..ceb31856283c 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -469,14 +469,6 @@ std::optional Attribute::getVScaleRangeMax() const { return unpackVScaleRangeArgs(pImpl->getValueAsInt()).second; } -unsigned Attribute::getVScaleValue() const { - std::optional VScale = getVScaleRangeMax(); - if (VScale && *VScale == getVScaleRangeMin()) - return *VScale; - - return 0; -} - UWTableKind Attribute::getUWTableKind() const { assert(hasAttribute(Attribute::UWTable) && "Trying to get unwind table kind from non-uwtable attribute"); diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index e88bf1ca4596..e88c130cccf2 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -1118,14 +1118,8 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { return PI.setAborted(&LI); TypeSize Size = DL.getTypeStoreSize(LI.getType()); - if (Size.isScalable()) { - Attribute Attr = LI.getFunction()->getFnAttribute(Attribute::VScaleRange); - unsigned VScale = Attr.isValid() ? Attr.getVScaleValue() : 0; - if (!VScale) - return PI.setAborted(&LI); - - Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale); - } + if (Size.isScalable()) + return PI.setAborted(&LI); return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(), LI.isVolatile()); @@ -1139,14 +1133,8 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { return PI.setAborted(&SI); TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType()); - if (StoreSize.isScalable()) { - Attribute Attr = SI.getFunction()->getFnAttribute(Attribute::VScaleRange); - unsigned VScale = Attr.isValid() ? 
Attr.getVScaleValue() : 0; - if (!VScale) - return PI.setAborted(&SI); - - StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale); - } + if (StoreSize.isScalable()) + return PI.setAborted(&SI); uint64_t Size = StoreSize.getFixedValue(); @@ -1937,8 +1925,7 @@ static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) { /// ensure that we only try to convert viable values. The strategy is that we /// will peel off single element struct and array wrappings to get to an /// underlying value, and convert that value. -static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy, - unsigned VScale = 0) { +static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { if (OldTy == NewTy) return true; @@ -1952,24 +1939,8 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy, return false; } - TypeSize NewSize = DL.getTypeSizeInBits(NewTy); - TypeSize OldSize = DL.getTypeSizeInBits(OldTy); - - if (isa(NewTy) && isa(OldTy)) { - if (!VScale || NewTy->isPtrOrPtrVectorTy() || OldTy->isPtrOrPtrVectorTy() || - !VectorType::getWithSizeAndScalar(cast(NewTy), OldTy)) - return false; - - NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale); - } else if (isa(OldTy) && isa(NewTy)) { - if (!VScale || NewTy->isPtrOrPtrVectorTy() || OldTy->isPtrOrPtrVectorTy() || - !VectorType::getWithSizeAndScalar(cast(OldTy), NewTy)) - return false; - - OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale); - } - - if (NewSize != OldSize) + if (DL.getTypeSizeInBits(NewTy).getFixedValue() != + DL.getTypeSizeInBits(OldTy).getFixedValue()) return false; if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) return false; @@ -2019,15 +1990,7 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy, static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, Type *NewTy) { Type *OldTy = V->getType(); - -#ifndef NDEBUG - BasicBlock *BB = IRB.GetInsertBlock(); - assert(BB && BB->getParent() && "VScale unknown!"); - Attribute Attr = BB->getParent()->getFnAttribute(Attribute::VScaleRange); - unsigned VScale = Attr.isValid() ? Attr.getVScaleValue() : 0; - assert(canConvertValue(DL, OldTy, NewTy, VScale) && - "Value not convertable to type"); -#endif + assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type"); if (OldTy == NewTy) return V; @@ -2071,18 +2034,6 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, } } - if (isa(NewTy) && isa(OldTy)) { - auto *Ty = VectorType::getWithSizeAndScalar(cast(NewTy), OldTy); - V = IRB.CreateInsertVector(Ty, PoisonValue::get(Ty), V, IRB.getInt64(0)); - return IRB.CreateBitCast(V, NewTy); - } - - if (isa(NewTy) && isa(OldTy)) { - auto *Ty = VectorType::getWithSizeAndScalar(cast(OldTy), NewTy); - V = IRB.CreateBitCast(V, Ty); - return IRB.CreateExtractVector(NewTy, V, IRB.getInt64(0)); - } - return IRB.CreateBitCast(V, NewTy); } @@ -2093,8 +2044,7 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, VectorType *Ty, uint64_t ElementSize, - const DataLayout &DL, - unsigned VScale) { + const DataLayout &DL) { // First validate the slice offsets. 
uint64_t BeginOffset = std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset(); @@ -2138,7 +2088,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, assert(LTy->isIntegerTy()); LTy = SplitIntTy; } - if (!canConvertValue(DL, SliceTy, LTy, VScale)) + if (!canConvertValue(DL, SliceTy, LTy)) return false; } else if (StoreInst *SI = dyn_cast(U->getUser())) { if (SI->isVolatile()) @@ -2151,7 +2101,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, assert(STy->isIntegerTy()); STy = SplitIntTy; } - if (!canConvertValue(DL, STy, SliceTy, VScale)) + if (!canConvertValue(DL, STy, SliceTy)) return false; } else { return false; @@ -2166,7 +2116,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, /// (and thus isVectorPromotionViable) over all slices of the alloca for the /// given VectorType. static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy, - const DataLayout &DL, unsigned VScale) { + const DataLayout &DL) { uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue(); @@ -2179,11 +2129,11 @@ static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy, ElementSize /= 8; for (const Slice &S : P) - if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale)) + if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL)) return false; for (const Slice *S : P.splitSliceTails()) - if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale)) + if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL)) return false; return true; @@ -2198,7 +2148,7 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, SmallVectorImpl &CandidateTys, bool HaveCommonEltTy, Type *CommonEltTy, bool HaveVecPtrTy, bool HaveCommonVecPtrTy, - VectorType *CommonVecPtrTy, unsigned VScale) { + VectorType *CommonVecPtrTy) { // If we didn't find a vector type, nothing to do here. if (CandidateTys.empty()) return nullptr; @@ -2274,7 +2224,7 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, }); for (VectorType *VTy : CandidateTys) - if (checkVectorTypeForPromotion(P, VTy, DL, VScale)) + if (checkVectorTypeForPromotion(P, VTy, DL)) return VTy; return nullptr; @@ -2285,7 +2235,7 @@ static VectorType *createAndCheckVectorTypesForPromotion( function_ref CheckCandidateType, Partition &P, const DataLayout &DL, SmallVectorImpl &CandidateTys, bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy, - bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) { + bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy) { [[maybe_unused]] VectorType *OriginalElt = CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr; // Consider additional vector types where the element type size is a @@ -2310,9 +2260,9 @@ static VectorType *createAndCheckVectorTypesForPromotion( } } - return checkVectorTypesForPromotion( - P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, - HaveCommonVecPtrTy, CommonVecPtrTy, VScale); + return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy, + CommonEltTy, HaveVecPtrTy, + HaveCommonVecPtrTy, CommonVecPtrTy); } /// Test whether the given alloca partitioning and range of slices can be @@ -2324,8 +2274,7 @@ static VectorType *createAndCheckVectorTypesForPromotion( /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here. 
-static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL, - unsigned VScale) { +static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { // Collect the candidate types for vector-based promotion. Also track whether // we have different element types. SmallVector CandidateTys; @@ -2337,7 +2286,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL, bool HaveCommonEltTy = true; bool HaveCommonVecPtrTy = true; auto CheckCandidateType = [&](Type *Ty) { - if (auto *VTy = dyn_cast(Ty)) { + if (auto *VTy = dyn_cast(Ty)) { // Return if bitcast to vectors is different for total size in bits. if (!CandidateTys.empty()) { VectorType *V = CandidateTys[0]; @@ -2392,14 +2341,14 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL, if (auto *VTy = createAndCheckVectorTypesForPromotion( LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, - HaveCommonVecPtrTy, CommonVecPtrTy, VScale)) + HaveCommonVecPtrTy, CommonVecPtrTy)) return VTy; CandidateTys.clear(); return createAndCheckVectorTypesForPromotion( DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy, - CommonVecPtrTy, VScale); + CommonVecPtrTy); } /// Test whether a slice of an alloca is valid for integer widening. @@ -2436,8 +2385,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (LI->isVolatile()) return false; // We can't handle loads that extend past the allocated memory. - TypeSize LoadSize = DL.getTypeStoreSize(LI->getType()); - if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size) + if (DL.getTypeStoreSize(LI->getType()).getFixedValue() > Size) return false; // So far, AllocaSliceRewriter does not support widening split slice tails // in rewriteIntegerLoad. @@ -2462,8 +2410,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (SI->isVolatile()) return false; // We can't handle stores that extend past the allocated memory. - TypeSize StoreSize = DL.getTypeStoreSize(ValueTy); - if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size) + if (DL.getTypeStoreSize(ValueTy).getFixedValue() > Size) return false; // So far, AllocaSliceRewriter does not support widening split slice tails // in rewriteIntegerStore. @@ -2936,6 +2883,8 @@ class AllocaSliceRewriter : public InstVisitor { Type *TargetTy = IsSplit ? 
Type::getIntNTy(LI.getContext(), SliceSize * 8) : LI.getType(); + const bool IsLoadPastEnd = + DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize; bool IsPtrAdjusted = false; Value *V; if (VecTy) { @@ -2945,9 +2894,8 @@ class AllocaSliceRewriter : public InstVisitor { } else if (NewBeginOffset == NewAllocaBeginOffset && NewEndOffset == NewAllocaEndOffset && (canConvertValue(DL, NewAllocaTy, TargetTy) || - (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() && - DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize && - !LI.isVolatile()))) { + (IsLoadPastEnd && NewAllocaTy->isIntegerTy() && + TargetTy->isIntegerTy() && !LI.isVolatile()))) { Value *NewPtr = getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile()); LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr, @@ -3120,8 +3068,7 @@ class AllocaSliceRewriter : public InstVisitor { if (AllocaInst *AI = dyn_cast(V->stripInBoundsOffsets())) Pass.PostPromotionWorklist.insert(AI); - TypeSize StoreSize = DL.getTypeStoreSize(V->getType()); - if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) { + if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedValue()) { assert(!SI.isVolatile()); assert(V->getType()->isIntegerTy() && "Only integer type loads and stores are split"); @@ -4897,19 +4844,14 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Type *SliceTy = nullptr; VectorType *SliceVecTy = nullptr; const DataLayout &DL = AI.getDataLayout(); - Attribute Attr = AI.getFunction()->getFnAttribute(Attribute::VScaleRange); - unsigned VScale = Attr.isValid() ? Attr.getVScaleValue() : 0; - std::pair CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()); // Do all uses operate on the same type? - if (CommonUseTy.first) { - TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first); - if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) { + if (CommonUseTy.first) + if (DL.getTypeAllocSize(CommonUseTy.first).getFixedValue() >= P.size()) { SliceTy = CommonUseTy.first; SliceVecTy = dyn_cast(SliceTy); } - } // If not, can we find an appropriate subtype in the original allocated type? if (!SliceTy) if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), @@ -4930,12 +4872,12 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // If the common use types are not viable for promotion then attempt to find // another type that is viable. - if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL, VScale)) + if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL)) if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), P.beginOffset(), P.size())) { VectorType *TypePartitionVecTy = dyn_cast(TypePartitionTy); if (TypePartitionVecTy && - checkVectorTypeForPromotion(P, TypePartitionVecTy, DL, VScale)) + checkVectorTypeForPromotion(P, TypePartitionVecTy, DL)) SliceTy = TypePartitionTy; } @@ -4946,7 +4888,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL); VectorType *VecTy = - IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale); + IsIntegerPromotable ? 
nullptr : isVectorPromotionViable(P, DL); if (VecTy) SliceTy = VecTy; diff --git a/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll b/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll deleted file mode 100644 index b4df64a4e45c..000000000000 --- a/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll +++ /dev/null @@ -1,248 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG -; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" - -; This test checks that SROA runs mem2reg on scalable vectors. - -define @alloca_nxv16i1( %pg) vscale_range(1) { -; CHECK-LABEL: @alloca_nxv16i1( -; CHECK-NEXT: ret [[PG:%.*]] -; - %pg.addr = alloca - store %pg, ptr %pg.addr - %1 = load , ptr %pg.addr - ret %1 -} - -define @alloca_nxv16i8( %vec) vscale_range(1) { -; CHECK-LABEL: @alloca_nxv16i8( -; CHECK-NEXT: ret [[VEC:%.*]] -; - %vec.addr = alloca - store %vec, ptr %vec.addr - %1 = load , ptr %vec.addr - ret %1 -} - -; Test scalable alloca that can't be promoted. Mem2Reg only considers -; non-volatile loads and stores for promotion. -define @unpromotable_alloca( %vec) vscale_range(1) { -; CHECK-LABEL: @unpromotable_alloca( -; CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca , align 16 -; CHECK-NEXT: store volatile [[VEC:%.*]], ptr [[VEC_ADDR]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load volatile , ptr [[VEC_ADDR]], align 16 -; CHECK-NEXT: ret [[TMP1]] -; - %vec.addr = alloca - store volatile %vec, ptr %vec.addr - %1 = load volatile , ptr %vec.addr - ret %1 -} - -; Test we bail out when using an alloca of a fixed-length vector (VLS) that was -; bitcasted to a scalable vector. -define @cast_alloca_to_svint32_t( %type.coerce) vscale_range(1) { -; CHECK-LABEL: @cast_alloca_to_svint32_t( -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32( [[TYPE_COERCE:%.*]], i64 0) -; CHECK-NEXT: [[TYPE_0_VEC_EXPAND:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TYPE_0_VECBLEND:%.*]] = select <16 x i1> , <16 x i32> [[TYPE_0_VEC_EXPAND]], <16 x i32> undef -; CHECK-NEXT: [[TYPE_ADDR_0_VEC_EXTRACT:%.*]] = shufflevector <16 x i32> [[TYPE_0_VECBLEND]], <16 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> [[TYPE_ADDR_0_VEC_EXTRACT]], i64 0) -; CHECK-NEXT: ret [[TMP2]] -; - %type = alloca <16 x i32> - %type.addr = alloca <16 x i32> - store %type.coerce, ptr %type - %type1 = load <16 x i32>, ptr %type - store <16 x i32> %type1, ptr %type.addr - %1 = load <16 x i32>, ptr %type.addr - %2 = load , ptr %type.addr - ret %2 -} - -; When casting from VLA to VLS via memory check we bail out when producing a -; GEP where the element type is a scalable vector. 
-define @cast_alloca_from_svint32_t() vscale_range(1) { -; CHECK-LABEL: @cast_alloca_from_svint32_t( -; CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 -; CHECK-NEXT: store <16 x i32> undef, ptr [[RETVAL_COERCE]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 16 -; CHECK-NEXT: ret [[TMP1]] -; - %retval = alloca <16 x i32> - %retval.coerce = alloca - call void @llvm.memcpy.p0.p0.i64(ptr align 16 %retval.coerce, ptr align 16 %retval, i64 64, i1 false) - %1 = load , ptr %retval.coerce - ret %1 -} - -; Test we bail out when using an alloca of a fixed-length vector (VLS) that was -; bitcasted to a scalable vector. -define void @select_load_alloca_to_svdouble_t() vscale_range(1) { -; CHECK-LABEL: @select_load_alloca_to_svdouble_t( -; CHECK-NEXT: [[Z:%.*]] = alloca <16 x half>, align 32 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 0, 0 -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], ptr [[Z]], ptr null -; CHECK-NEXT: [[VAL:%.*]] = load , ptr [[COND]], align 16 -; CHECK-NEXT: ret void -; - %z = alloca <16 x half> - %cmp = icmp eq i32 0, 0 - %cond = select i1 %cmp, ptr %z, ptr null - %val = load , ptr %cond, align 16 - ret void -} - -define void @select_store_alloca_to_svdouble_t( %val) vscale_range(1) { -; CHECK-LABEL: @select_store_alloca_to_svdouble_t( -; CHECK-NEXT: [[Z:%.*]] = alloca <16 x half>, align 32 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 0, 0 -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], ptr [[Z]], ptr null -; CHECK-NEXT: store [[VAL:%.*]], ptr [[COND]], align 16 -; CHECK-NEXT: ret void -; - %z = alloca <16 x half> - %cmp = icmp eq i32 0, 0 - %cond = select i1 %cmp, ptr %z, ptr null - store %val, ptr %cond, align 16 - ret void -} - -define <4 x i32> @fixed_alloca_fixed_from_scalable( %a) vscale_range(1) { -; CHECK-LABEL: @fixed_alloca_fixed_from_scalable( -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32( [[A:%.*]], i64 0) -; CHECK-NEXT: ret <4 x i32> [[TMP1]] -; - %tmp = alloca <4 x i32> - store %a, ptr %tmp - %cast = load <4 x i32>, ptr %tmp - ret <4 x i32> %cast -} - -define <2 x i8> @fixed_alloca_fixed_from_scalable_requires_bitcast( %a) vscale_range(1) { -; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_requires_bitcast( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [[A:%.*]] to -; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8( [[TMP1]], i64 0) -; CHECK-NEXT: ret <2 x i8> [[TMP2]] -; - %tmp = alloca <2 x i8> - store %a, ptr %tmp - %cast = load <2 x i8>, ptr %tmp - ret <2 x i8> %cast -} - -define @fixed_alloca_scalable_from_fixed(<4 x i32> %a) vscale_range(1) { -; CHECK-LABEL: @fixed_alloca_scalable_from_fixed( -; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> [[A:%.*]], i64 0) -; CHECK-NEXT: ret [[TMP1]] -; - %tmp = alloca <4 x i32> - store <4 x i32> %a, ptr %tmp - %cast = load , ptr %tmp - ret %cast -} - -define @fixed_alloca_scalable_from_fixed_requires_bitcast(<2 x i8> %a) vscale_range(1) { -; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_requires_bitcast( -; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv2i8.v2i8( poison, <2 x i8> [[A:%.*]], i64 0) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [[TMP1]] to -; CHECK-NEXT: ret [[TMP2]] -; - %tmp = alloca <2 x i8> - store <2 x i8> %a, ptr %tmp - %cast = load , ptr %tmp - ret %cast -} - -define <4 x i32> @scalable_alloca_fixed_from_scalable( %a) vscale_range(1) { -; CHECK-LABEL: @scalable_alloca_fixed_from_scalable( -; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 -; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], 
align 16 -; CHECK-NEXT: [[CAST:%.*]] = load <4 x i32>, ptr [[TMP]], align 16 -; CHECK-NEXT: ret <4 x i32> [[CAST]] -; - %tmp = alloca - store %a, ptr %tmp - %cast = load <4 x i32>, ptr %tmp - ret <4 x i32> %cast -} - -define @scalable_alloca_scalable_from_fixed(<4 x i32> %a) vscale_range(1) { -; CHECK-LABEL: @scalable_alloca_scalable_from_fixed( -; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 -; CHECK-NEXT: store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16 -; CHECK-NEXT: [[CAST:%.*]] = load , ptr [[TMP]], align 16 -; CHECK-NEXT: ret [[CAST]] -; - %tmp = alloca - store <4 x i32> %a, ptr %tmp - %cast = load , ptr %tmp - ret %cast -} - -define i16 @scalar_alloca_scalar_from_scalable( %a) vscale_range(1) { -; CHECK-LABEL: @scalar_alloca_scalar_from_scalable( -; CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 -; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 2 -; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load i16, ptr [[TMP]], align 2 -; CHECK-NEXT: ret i16 [[TMP_0_CAST]] -; - %tmp = alloca i16 - store %a, ptr %tmp - %cast = load i16, ptr %tmp - ret i16 %cast -} - -define @scalar_alloca_scalable_from_scalar(i16 %a) vscale_range(1) { -; CHECK-LABEL: @scalar_alloca_scalable_from_scalar( -; CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 -; CHECK-NEXT: store i16 [[A:%.*]], ptr [[TMP]], align 2 -; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 2 -; CHECK-NEXT: ret [[TMP_0_CAST]] -; - %tmp = alloca i16 - store i16 %a, ptr %tmp - %cast = load , ptr %tmp - ret %cast -} - -define { <2 x i32>, <2 x i32> } @fixed_struct_alloca_fixed_from_scalable( %a) vscale_range(1) { -; CHECK-LABEL: @fixed_struct_alloca_fixed_from_scalable( -; CHECK-NEXT: [[TMP:%.*]] = alloca { <2 x i32>, <2 x i32> }, align 8 -; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 8 -; CHECK-NEXT: [[TMP_0_CAST_FCA_0_LOAD:%.*]] = load <2 x i32>, ptr [[TMP]], align 8 -; CHECK-NEXT: [[CAST_FCA_0_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP_0_CAST_FCA_0_LOAD]], 0 -; CHECK-NEXT: [[TMP_8_CAST_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 8 -; CHECK-NEXT: [[TMP_8_CAST_FCA_1_LOAD:%.*]] = load <2 x i32>, ptr [[TMP_8_CAST_FCA_1_GEP_SROA_IDX]], align 8 -; CHECK-NEXT: [[CAST_FCA_1_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[CAST_FCA_0_INSERT]], <2 x i32> [[TMP_8_CAST_FCA_1_LOAD]], 1 -; CHECK-NEXT: ret { <2 x i32>, <2 x i32> } [[CAST_FCA_1_INSERT]] -; - %tmp = alloca { <2 x i32>, <2 x i32> } - store %a, ptr %tmp - %cast = load { <2 x i32>, <2 x i32> }, ptr %tmp - ret { <2 x i32>, <2 x i32> } %cast -} - -define @fixed_struct_alloca_scalable_from_fixed({ <2 x ptr>, <2 x ptr> } %a) vscale_range(1) { -; CHECK-LABEL: @fixed_struct_alloca_scalable_from_fixed( -; CHECK-NEXT: [[TMP:%.*]] = alloca { <2 x ptr>, <2 x ptr> }, align 16 -; CHECK-NEXT: [[A_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A:%.*]], 0 -; CHECK-NEXT: store <2 x ptr> [[A_FCA_0_EXTRACT]], ptr [[TMP]], align 16 -; CHECK-NEXT: [[A_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A]], 1 -; CHECK-NEXT: [[TMP_16_A_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 16 -; CHECK-NEXT: store <2 x ptr> [[A_FCA_1_EXTRACT]], ptr [[TMP_16_A_FCA_1_GEP_SROA_IDX]], align 16 -; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 16 -; CHECK-NEXT: ret [[TMP_0_CAST]] -; - %tmp = alloca { <2 x ptr>, <2 x ptr> } - store { <2 x ptr>, <2 x ptr> } %a, ptr %tmp - %cast = load , ptr %tmp - ret %cast -} - -declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind -;; NOTE: 
These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK-MODIFY-CFG: {{.*}} -; CHECK-PRESERVE-CFG: {{.*}} diff --git a/llvm/test/Transforms/SROA/scalable-vectors.ll b/llvm/test/Transforms/SROA/scalable-vectors.ll index 9d6dec34b35b..d892883ce9dc 100644 --- a/llvm/test/Transforms/SROA/scalable-vectors.ll +++ b/llvm/test/Transforms/SROA/scalable-vectors.ll @@ -2,8 +2,6 @@ ; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG ; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" - ; This test checks that SROA runs mem2reg on scalable vectors. define @alloca_nxv16i1( %pg) { @@ -112,146 +110,6 @@ define void @select_store_alloca_to_svdouble_t( %val) { ret void } -define <4 x i32> @fixed_alloca_fixed_from_scalable( %a) { -; CHECK-LABEL: @fixed_alloca_fixed_from_scalable( -; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 -; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[TMP]], align 16 -; CHECK-NEXT: ret <4 x i32> [[TMP1]] -; - %tmp = alloca <4 x i32> - store %a, ptr %tmp - %cast = load <4 x i32>, ptr %tmp - ret <4 x i32> %cast -} - -define <2 x i8> @fixed_alloca_fixed_from_scalable_requires_bitcast( %a) { -; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_requires_bitcast( -; CHECK-NEXT: [[TMP:%.*]] = alloca <2 x i8>, align 2 -; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[TMP]], align 2 -; CHECK-NEXT: ret <2 x i8> [[TMP2]] -; - %tmp = alloca <2 x i8> - store %a, ptr %tmp - %cast = load <2 x i8>, ptr %tmp - ret <2 x i8> %cast -} - -define @fixed_alloca_scalable_from_fixed(<4 x i32> %a) { -; CHECK-LABEL: @fixed_alloca_scalable_from_fixed( -; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 -; CHECK-NEXT: store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[TMP]], align 16 -; CHECK-NEXT: ret [[TMP1]] -; - %tmp = alloca <4 x i32> - store <4 x i32> %a, ptr %tmp - %cast = load , ptr %tmp - ret %cast -} - -define @fixed_alloca_scalable_from_fixed_requires_bitcast(<2 x i8> %a) { -; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_requires_bitcast( -; CHECK-NEXT: [[TMP:%.*]] = alloca <2 x i8>, align 2 -; CHECK-NEXT: store <2 x i8> [[A:%.*]], ptr [[TMP]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[TMP]], align 2 -; CHECK-NEXT: ret [[TMP2]] -; - %tmp = alloca <2 x i8> - store <2 x i8> %a, ptr %tmp - %cast = load , ptr %tmp - ret %cast -} - -define <4 x i32> @scalable_alloca_fixed_from_scalable( %a) { -; CHECK-LABEL: @scalable_alloca_fixed_from_scalable( -; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 -; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 -; CHECK-NEXT: [[CAST:%.*]] = load <4 x i32>, ptr [[TMP]], align 16 -; CHECK-NEXT: ret <4 x i32> [[CAST]] -; - %tmp = alloca - store %a, ptr %tmp - %cast = load <4 x i32>, ptr %tmp - ret <4 x i32> %cast -} - -define @scalable_alloca_scalable_from_fixed(<4 x i32> %a) { -; CHECK-LABEL: @scalable_alloca_scalable_from_fixed( -; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 -; CHECK-NEXT: store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16 -; CHECK-NEXT: [[CAST:%.*]] = load , ptr [[TMP]], align 16 -; CHECK-NEXT: ret [[CAST]] -; - %tmp = alloca - store <4 x i32> %a, ptr %tmp - %cast = load , ptr %tmp - ret %cast -} - -define i16 
@scalar_alloca_scalar_from_scalable( %a) { -; CHECK-LABEL: @scalar_alloca_scalar_from_scalable( -; CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 -; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 2 -; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load i16, ptr [[TMP]], align 2 -; CHECK-NEXT: ret i16 [[TMP_0_CAST]] -; - %tmp = alloca i16 - store %a, ptr %tmp - %cast = load i16, ptr %tmp - ret i16 %cast -} - -define @scalar_alloca_scalable_from_scalar(i16 %a) { -; CHECK-LABEL: @scalar_alloca_scalable_from_scalar( -; CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 -; CHECK-NEXT: store i16 [[A:%.*]], ptr [[TMP]], align 2 -; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 2 -; CHECK-NEXT: ret [[TMP_0_CAST]] -; - %tmp = alloca i16 - store i16 %a, ptr %tmp - %cast = load , ptr %tmp - ret %cast -} - -define { <2 x i32>, <2 x i32> } @fixed_struct_alloca_fixed_from_scalable( %a) { -; CHECK-LABEL: @fixed_struct_alloca_fixed_from_scalable( -; CHECK-NEXT: [[TMP:%.*]] = alloca { <2 x i32>, <2 x i32> }, align 8 -; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 -; CHECK-NEXT: [[CAST_FCA_0_GEP:%.*]] = getelementptr inbounds { <2 x i32>, <2 x i32> }, ptr [[TMP]], i32 0, i32 0 -; CHECK-NEXT: [[TMP_0_CAST_FCA_0_LOAD:%.*]] = load <2 x i32>, ptr [[CAST_FCA_0_GEP]], align 8 -; CHECK-NEXT: [[CAST_FCA_0_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP_0_CAST_FCA_0_LOAD]], 0 -; CHECK-NEXT: [[TMP_8_CAST_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds { <2 x i32>, <2 x i32> }, ptr [[TMP]], i32 0, i32 1 -; CHECK-NEXT: [[TMP_8_CAST_FCA_1_LOAD:%.*]] = load <2 x i32>, ptr [[TMP_8_CAST_FCA_1_GEP_SROA_IDX]], align 8 -; CHECK-NEXT: [[CAST_FCA_1_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[CAST_FCA_0_INSERT]], <2 x i32> [[TMP_8_CAST_FCA_1_LOAD]], 1 -; CHECK-NEXT: ret { <2 x i32>, <2 x i32> } [[CAST_FCA_1_INSERT]] -; - %tmp = alloca { <2 x i32>, <2 x i32> } - store %a, ptr %tmp - %cast = load { <2 x i32>, <2 x i32> }, ptr %tmp - ret { <2 x i32>, <2 x i32> } %cast -} - -define @fixed_struct_alloca_scalable_from_fixed({ <2 x ptr>, <2 x ptr> } %a) { -; CHECK-LABEL: @fixed_struct_alloca_scalable_from_fixed( -; CHECK-NEXT: [[TMP:%.*]] = alloca { <2 x ptr>, <2 x ptr> }, align 16 -; CHECK-NEXT: [[A_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A:%.*]], 0 -; CHECK-NEXT: [[A_FCA_0_GEP:%.*]] = getelementptr inbounds { <2 x ptr>, <2 x ptr> }, ptr [[TMP]], i32 0, i32 0 -; CHECK-NEXT: store <2 x ptr> [[A_FCA_0_EXTRACT]], ptr [[A_FCA_0_GEP]], align 16 -; CHECK-NEXT: [[A_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A]], 1 -; CHECK-NEXT: [[TMP_16_A_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds { <2 x ptr>, <2 x ptr> }, ptr [[TMP]], i32 0, i32 1 -; CHECK-NEXT: store <2 x ptr> [[A_FCA_1_EXTRACT]], ptr [[TMP_16_A_FCA_1_GEP_SROA_IDX]], align 16 -; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 32 -; CHECK-NEXT: ret [[TMP_0_CAST]] -; - %tmp = alloca { <2 x ptr>, <2 x ptr> } - store { <2 x ptr>, <2 x ptr> } %a, ptr %tmp - %cast = load , ptr %tmp - ret %cast -} - declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK-MODIFY-CFG: {{.*}} From b6b0ec69250e7f57cc93fa63c3f2ba74dc8d4d9f Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Wed, 11 Jun 2025 11:02:32 +0100 Subject: [PATCH 2/2] [LLVM][SROA] Teach SROA how to "bitcast" between fixed and scalable vectors. 
(#130973) For function whose vscale_range is limited to a single value we can size scalable vectors. This aids SROA by allowing scalable vector load and store operations to be considered for replacement whereby bitcasts through memory can be replaced by vector insert or extract operations. --- .../CodeGen/attr-arm-sve-vector-bits-cast.c | 23 +- llvm/include/llvm/IR/DerivedTypes.h | 17 + llvm/include/llvm/IR/Function.h | 4 + llvm/lib/IR/Function.cpp | 12 + llvm/lib/Transforms/Scalar/SROA.cpp | 165 +++++++-- .../scalable-vectors-with-known-vscale.ll | 349 ++++++++++++++++++ llvm/test/Transforms/SROA/scalable-vectors.ll | 223 ++++++++++- 7 files changed, 738 insertions(+), 55 deletions(-) create mode 100644 llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c index e1e2220f94d6..fcd4314249ff 100644 --- a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c +++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c @@ -62,10 +62,7 @@ fixed_bool_t from_svbool_t(svbool_t type) { // CHECK-LABEL: @lax_cast( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <16 x i32>, align 64 -// CHECK-NEXT: [[TYPE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE_COERCE:%.*]], i64 0) -// CHECK-NEXT: store <16 x i32> [[TYPE]], ptr [[SAVED_VALUE]], align 64, !tbaa [[TBAA6:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load , ptr [[SAVED_VALUE]], align 64, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[TYPE_COERCE:%.*]] to // CHECK-NEXT: ret [[TMP0]] // svint64_t lax_cast(fixed_int32_t type) { @@ -74,9 +71,9 @@ svint64_t lax_cast(fixed_int32_t type) { // CHECK-LABEL: @to_svint32_t__from_gnu_int32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA6]] -// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) -// CHECK-NEXT: ret [[CASTSCALABLESVE]] +// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) +// CHECK-NEXT: ret [[CAST_SCALABLE]] // svint32_t to_svint32_t__from_gnu_int32_t(gnu_int32_t type) { return type; @@ -84,8 +81,8 @@ svint32_t to_svint32_t__from_gnu_int32_t(gnu_int32_t type) { // CHECK-LABEL: @from_svint32_t__to_gnu_int32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE:%.*]], i64 0) -// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA6]] +// CHECK-NEXT: [[CAST_FIXED:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE:%.*]], i64 0) +// CHECK-NEXT: store <16 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA2]] // CHECK-NEXT: ret void // gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) { @@ -94,9 +91,9 @@ gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) { // CHECK-LABEL: @to_fixed_int32_t__from_gnu_int32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA6]] -// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) -// CHECK-NEXT: ret [[CASTSCALABLESVE]] +// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa 
[[TBAA2]] +// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) +// CHECK-NEXT: ret [[CAST_SCALABLE]] // fixed_int32_t to_fixed_int32_t__from_gnu_int32_t(gnu_int32_t type) { return type; @@ -105,7 +102,7 @@ fixed_int32_t to_fixed_int32_t__from_gnu_int32_t(gnu_int32_t type) { // CHECK-LABEL: @from_fixed_int32_t__to_gnu_int32_t( // CHECK-NEXT: entry: // CHECK-NEXT: [[TYPE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE_COERCE:%.*]], i64 0) -// CHECK-NEXT: store <16 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA6]] +// CHECK-NEXT: store <16 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA2]] // CHECK-NEXT: ret void // gnu_int32_t from_fixed_int32_t__to_gnu_int32_t(fixed_int32_t type) { diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index b44f4f8c8687..71ab03aa1ae1 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -545,6 +545,23 @@ class VectorType : public Type { return VectorType::get(VTy->getElementType(), EltCnt * 2); } + /// This static method attempts to construct a VectorType with the same + /// size-in-bits as SizeTy but with an element type that matches the scalar + /// type of EltTy. The VectorType is returned on success, nullptr otherwise. + static VectorType *getWithSizeAndScalar(VectorType *SizeTy, Type *EltTy) { + if (SizeTy->getScalarType() == EltTy->getScalarType()) + return SizeTy; + + unsigned EltSize = EltTy->getScalarSizeInBits(); + if (!SizeTy->getPrimitiveSizeInBits().isKnownMultipleOf(EltSize)) + return nullptr; + + ElementCount EC = SizeTy->getElementCount() + .multiplyCoefficientBy(SizeTy->getScalarSizeInBits()) + .divideCoefficientBy(EltSize); + return VectorType::get(EltTy->getScalarType(), EC); + } + /// Return true if the specified type is valid as a element type. static bool isValidElementType(Type *ElemTy); diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index e7afcbd31420..b1594b5f03d8 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -1030,6 +1030,10 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node { /// Return value: true => null pointer dereference is not undefined. bool nullPointerIsDefined() const; + /// Return the value for vscale based on the vscale_range attribute or 0 when + /// unknown. 
+ unsigned getVScaleValue() const; + private: void allocHungoffUselist(); template void setHungoffOperand(Constant *C); diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 9c5dd5aeb92e..58f7b6f146a6 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -1184,6 +1184,18 @@ bool Function::nullPointerIsDefined() const { return hasFnAttribute(Attribute::NullPointerIsValid); } +unsigned Function::getVScaleValue() const { + Attribute Attr = getFnAttribute(Attribute::VScaleRange); + if (!Attr.isValid()) + return 0; + + unsigned VScale = Attr.getVScaleRangeMin(); + if (VScale && VScale == Attr.getVScaleRangeMax()) + return VScale; + + return 0; +} + bool llvm::NullPointerIsDefined(const Function *F, unsigned AS) { if (F && F->nullPointerIsDefined()) return true; diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index e88c130cccf2..5c0f1fcefa52 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -1118,8 +1118,13 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { return PI.setAborted(&LI); TypeSize Size = DL.getTypeStoreSize(LI.getType()); - if (Size.isScalable()) - return PI.setAborted(&LI); + if (Size.isScalable()) { + unsigned VScale = LI.getFunction()->getVScaleValue(); + if (!VScale) + return PI.setAborted(&LI); + + Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale); + } return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(), LI.isVolatile()); @@ -1133,8 +1138,13 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { return PI.setAborted(&SI); TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType()); - if (StoreSize.isScalable()) - return PI.setAborted(&SI); + if (StoreSize.isScalable()) { + unsigned VScale = SI.getFunction()->getVScaleValue(); + if (!VScale) + return PI.setAborted(&SI); + + StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale); + } uint64_t Size = StoreSize.getFixedValue(); @@ -1925,7 +1935,8 @@ static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) { /// ensure that we only try to convert viable values. The strategy is that we /// will peel off single element struct and array wrappings to get to an /// underlying value, and convert that value. -static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { +static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy, + unsigned VScale = 0) { if (OldTy == NewTy) return true; @@ -1939,8 +1950,35 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { return false; } - if (DL.getTypeSizeInBits(NewTy).getFixedValue() != - DL.getTypeSizeInBits(OldTy).getFixedValue()) + TypeSize NewSize = DL.getTypeSizeInBits(NewTy); + TypeSize OldSize = DL.getTypeSizeInBits(OldTy); + + if ((isa(NewTy) && isa(OldTy)) || + (isa(OldTy) && isa(NewTy))) { + // Conversion is only possible when the size of scalable vectors is known. + if (!VScale) + return false; + + // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within + // a single domain (either fixed or scalable). Any additional conversion + // between fixed and scalable types is handled through integer types. + auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy; + auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? 
DL.getIntPtrType(NewTy) : NewTy; + + if (isa(NewTy)) { + if (!VectorType::getWithSizeAndScalar(cast(NewVTy), OldVTy)) + return false; + + NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale); + } else { + if (!VectorType::getWithSizeAndScalar(cast(OldVTy), NewVTy)) + return false; + + OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale); + } + } + + if (NewSize != OldSize) return false; if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) return false; @@ -1990,7 +2028,14 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, Type *NewTy) { Type *OldTy = V->getType(); - assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type"); + +#ifndef NDEBUG + BasicBlock *BB = IRB.GetInsertBlock(); + assert(BB && BB->getParent() && "VScale unknown!"); + unsigned VScale = BB->getParent()->getVScaleValue(); + assert(canConvertValue(DL, OldTy, NewTy, VScale) && + "Value not convertable to type"); +#endif if (OldTy == NewTy) return V; @@ -1998,13 +2043,41 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, assert(!(isa(OldTy) && isa(NewTy)) && "Integer types must be the exact same to convert."); + // A variant of bitcast that supports a mixture of fixed and scalable types + // that are know to have the same size. + auto CreateBitCastLike = [&IRB](Value *In, Type *Ty) -> Value * { + Type *InTy = In->getType(); + if (InTy == Ty) + return In; + + if (isa(InTy) && isa(Ty)) { + // For vscale_range(2) expand <4 x i32> to --> + // <4 x i32> to to + auto *VTy = VectorType::getWithSizeAndScalar(cast(Ty), InTy); + return IRB.CreateBitCast(IRB.CreateInsertVector(VTy, + PoisonValue::get(VTy), In, + IRB.getInt64(0)), + Ty); + } + + if (isa(InTy) && isa(Ty)) { + // For vscale_range(2) expand to <4 x i32> --> + // to to <4 x i32> + auto *VTy = VectorType::getWithSizeAndScalar(cast(InTy), Ty); + return IRB.CreateExtractVector(Ty, IRB.CreateBitCast(In, VTy), + IRB.getInt64(0)); + } + + return IRB.CreateBitCast(In, Ty); + }; + // See if we need inttoptr for this type pair. May require additional bitcast. if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) { // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8* // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*> // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*> // Directly handle i64 to i8* - return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)), + return IRB.CreateIntToPtr(CreateBitCastLike(V, DL.getIntPtrType(NewTy)), NewTy); } @@ -2014,7 +2087,7 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32> // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32> // Expand i8* to i64 --> i8* to i64 to i64 - return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), + return CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), NewTy); } @@ -2029,12 +2102,14 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, // size. 
if (OldAS != NewAS) { assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS)); - return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), - NewTy); + return IRB.CreateIntToPtr( + CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), + DL.getIntPtrType(NewTy)), + NewTy); } } - return IRB.CreateBitCast(V, NewTy); + return CreateBitCastLike(V, NewTy); } /// Test whether the given slice use can be promoted to a vector. @@ -2044,7 +2119,8 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, VectorType *Ty, uint64_t ElementSize, - const DataLayout &DL) { + const DataLayout &DL, + unsigned VScale) { // First validate the slice offsets. uint64_t BeginOffset = std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset(); @@ -2088,7 +2164,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, assert(LTy->isIntegerTy()); LTy = SplitIntTy; } - if (!canConvertValue(DL, SliceTy, LTy)) + if (!canConvertValue(DL, SliceTy, LTy, VScale)) return false; } else if (StoreInst *SI = dyn_cast(U->getUser())) { if (SI->isVolatile()) @@ -2101,7 +2177,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, assert(STy->isIntegerTy()); STy = SplitIntTy; } - if (!canConvertValue(DL, STy, SliceTy)) + if (!canConvertValue(DL, STy, SliceTy, VScale)) return false; } else { return false; @@ -2116,7 +2192,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, /// (and thus isVectorPromotionViable) over all slices of the alloca for the /// given VectorType. static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy, - const DataLayout &DL) { + const DataLayout &DL, unsigned VScale) { uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue(); @@ -2129,11 +2205,11 @@ static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy, ElementSize /= 8; for (const Slice &S : P) - if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL)) + if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale)) return false; for (const Slice *S : P.splitSliceTails()) - if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL)) + if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale)) return false; return true; @@ -2148,7 +2224,7 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, SmallVectorImpl &CandidateTys, bool HaveCommonEltTy, Type *CommonEltTy, bool HaveVecPtrTy, bool HaveCommonVecPtrTy, - VectorType *CommonVecPtrTy) { + VectorType *CommonVecPtrTy, unsigned VScale) { // If we didn't find a vector type, nothing to do here. if (CandidateTys.empty()) return nullptr; @@ -2224,7 +2300,7 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, }); for (VectorType *VTy : CandidateTys) - if (checkVectorTypeForPromotion(P, VTy, DL)) + if (checkVectorTypeForPromotion(P, VTy, DL, VScale)) return VTy; return nullptr; @@ -2235,7 +2311,7 @@ static VectorType *createAndCheckVectorTypesForPromotion( function_ref CheckCandidateType, Partition &P, const DataLayout &DL, SmallVectorImpl &CandidateTys, bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy, - bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy) { + bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) { [[maybe_unused]] VectorType *OriginalElt = CandidateTysCopy.size() ? 
CandidateTysCopy[0] : nullptr; // Consider additional vector types where the element type size is a @@ -2260,9 +2336,9 @@ static VectorType *createAndCheckVectorTypesForPromotion( } } - return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy, - CommonEltTy, HaveVecPtrTy, - HaveCommonVecPtrTy, CommonVecPtrTy); + return checkVectorTypesForPromotion( + P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, + HaveCommonVecPtrTy, CommonVecPtrTy, VScale); } /// Test whether the given alloca partitioning and range of slices can be @@ -2274,7 +2350,8 @@ static VectorType *createAndCheckVectorTypesForPromotion( /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here. -static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { +static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL, + unsigned VScale) { // Collect the candidate types for vector-based promotion. Also track whether // we have different element types. SmallVector CandidateTys; @@ -2286,7 +2363,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { bool HaveCommonEltTy = true; bool HaveCommonVecPtrTy = true; auto CheckCandidateType = [&](Type *Ty) { - if (auto *VTy = dyn_cast(Ty)) { + if (auto *VTy = dyn_cast(Ty)) { // Return if bitcast to vectors is different for total size in bits. if (!CandidateTys.empty()) { VectorType *V = CandidateTys[0]; @@ -2341,14 +2418,14 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { if (auto *VTy = createAndCheckVectorTypesForPromotion( LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, - HaveCommonVecPtrTy, CommonVecPtrTy)) + HaveCommonVecPtrTy, CommonVecPtrTy, VScale)) return VTy; CandidateTys.clear(); return createAndCheckVectorTypesForPromotion( DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy, - CommonVecPtrTy); + CommonVecPtrTy, VScale); } /// Test whether a slice of an alloca is valid for integer widening. @@ -2385,7 +2462,8 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (LI->isVolatile()) return false; // We can't handle loads that extend past the allocated memory. - if (DL.getTypeStoreSize(LI->getType()).getFixedValue() > Size) + TypeSize LoadSize = DL.getTypeStoreSize(LI->getType()); + if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size) return false; // So far, AllocaSliceRewriter does not support widening split slice tails // in rewriteIntegerLoad. @@ -2410,7 +2488,8 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (SI->isVolatile()) return false; // We can't handle stores that extend past the allocated memory. - if (DL.getTypeStoreSize(ValueTy).getFixedValue() > Size) + TypeSize StoreSize = DL.getTypeStoreSize(ValueTy); + if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size) return false; // So far, AllocaSliceRewriter does not support widening split slice tails // in rewriteIntegerStore. @@ -2883,8 +2962,6 @@ class AllocaSliceRewriter : public InstVisitor { Type *TargetTy = IsSplit ? 
Type::getIntNTy(LI.getContext(), SliceSize * 8) : LI.getType(); - const bool IsLoadPastEnd = - DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize; bool IsPtrAdjusted = false; Value *V; if (VecTy) { @@ -2894,8 +2971,9 @@ class AllocaSliceRewriter : public InstVisitor { } else if (NewBeginOffset == NewAllocaBeginOffset && NewEndOffset == NewAllocaEndOffset && (canConvertValue(DL, NewAllocaTy, TargetTy) || - (IsLoadPastEnd && NewAllocaTy->isIntegerTy() && - TargetTy->isIntegerTy() && !LI.isVolatile()))) { + (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() && + DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize && + !LI.isVolatile()))) { Value *NewPtr = getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile()); LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr, @@ -3068,7 +3146,8 @@ class AllocaSliceRewriter : public InstVisitor { if (AllocaInst *AI = dyn_cast(V->stripInBoundsOffsets())) Pass.PostPromotionWorklist.insert(AI); - if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedValue()) { + TypeSize StoreSize = DL.getTypeStoreSize(V->getType()); + if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) { assert(!SI.isVolatile()); assert(V->getType()->isIntegerTy() && "Only integer type loads and stores are split"); @@ -4844,14 +4923,18 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Type *SliceTy = nullptr; VectorType *SliceVecTy = nullptr; const DataLayout &DL = AI.getDataLayout(); + unsigned VScale = AI.getFunction()->getVScaleValue(); + std::pair CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()); // Do all uses operate on the same type? - if (CommonUseTy.first) - if (DL.getTypeAllocSize(CommonUseTy.first).getFixedValue() >= P.size()) { + if (CommonUseTy.first) { + TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first); + if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) { SliceTy = CommonUseTy.first; SliceVecTy = dyn_cast(SliceTy); } + } // If not, can we find an appropriate subtype in the original allocated type? if (!SliceTy) if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), @@ -4872,12 +4955,12 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // If the common use types are not viable for promotion then attempt to find // another type that is viable. - if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL)) + if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL, VScale)) if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), P.beginOffset(), P.size())) { VectorType *TypePartitionVecTy = dyn_cast(TypePartitionTy); if (TypePartitionVecTy && - checkVectorTypeForPromotion(P, TypePartitionVecTy, DL)) + checkVectorTypeForPromotion(P, TypePartitionVecTy, DL, VScale)) SliceTy = TypePartitionTy; } @@ -4888,7 +4971,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL); VectorType *VecTy = - IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL); + IsIntegerPromotable ? 
nullptr : isVectorPromotionViable(P, DL, VScale); if (VecTy) SliceTy = VecTy; diff --git a/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll b/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll new file mode 100644 index 000000000000..85715e406e06 --- /dev/null +++ b/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll @@ -0,0 +1,349 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG +; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" + +; This test checks that SROA runs mem2reg on scalable vectors. + +define @alloca_nxv16i1( %pg) vscale_range(1) { +; CHECK-LABEL: @alloca_nxv16i1( +; CHECK-NEXT: ret [[PG:%.*]] +; + %pg.addr = alloca + store %pg, ptr %pg.addr + %1 = load , ptr %pg.addr + ret %1 +} + +define @alloca_nxv16i8( %vec) vscale_range(1) { +; CHECK-LABEL: @alloca_nxv16i8( +; CHECK-NEXT: ret [[VEC:%.*]] +; + %vec.addr = alloca + store %vec, ptr %vec.addr + %1 = load , ptr %vec.addr + ret %1 +} + +; Test scalable alloca that can't be promoted. Mem2Reg only considers +; non-volatile loads and stores for promotion. +define @unpromotable_alloca( %vec) vscale_range(1) { +; CHECK-LABEL: @unpromotable_alloca( +; CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca , align 16 +; CHECK-NEXT: store volatile [[VEC:%.*]], ptr [[VEC_ADDR]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load volatile , ptr [[VEC_ADDR]], align 16 +; CHECK-NEXT: ret [[TMP1]] +; + %vec.addr = alloca + store volatile %vec, ptr %vec.addr + %1 = load volatile , ptr %vec.addr + ret %1 +} + +; Test we bail out when using an alloca of a fixed-length vector (VLS) that was +; bitcasted to a scalable vector. +define @cast_alloca_to_svint32_t( %type.coerce) vscale_range(1) { +; CHECK-LABEL: @cast_alloca_to_svint32_t( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32( [[TYPE_COERCE:%.*]], i64 0) +; CHECK-NEXT: [[TYPE_0_VEC_EXPAND:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TYPE_0_VECBLEND:%.*]] = select <16 x i1> , <16 x i32> [[TYPE_0_VEC_EXPAND]], <16 x i32> undef +; CHECK-NEXT: [[TYPE_ADDR_0_VEC_EXTRACT:%.*]] = shufflevector <16 x i32> [[TYPE_0_VECBLEND]], <16 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> [[TYPE_ADDR_0_VEC_EXTRACT]], i64 0) +; CHECK-NEXT: ret [[TMP2]] +; + %type = alloca <16 x i32> + %type.addr = alloca <16 x i32> + store %type.coerce, ptr %type + %type1 = load <16 x i32>, ptr %type + store <16 x i32> %type1, ptr %type.addr + %1 = load <16 x i32>, ptr %type.addr + %2 = load , ptr %type.addr + ret %2 +} + +; When casting from VLA to VLS via memory check we bail out when producing a +; GEP where the element type is a scalable vector. 
+define @cast_alloca_from_svint32_t() vscale_range(1) { +; CHECK-LABEL: @cast_alloca_from_svint32_t( +; CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr [[RETVAL_COERCE]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 16 +; CHECK-NEXT: ret [[TMP1]] +; + %retval = alloca <16 x i32> + store <16 x i32> zeroinitializer, ptr %retval + %retval.coerce = alloca + call void @llvm.memcpy.p0.p0.i64(ptr align 16 %retval.coerce, ptr align 16 %retval, i64 64, i1 false) + %1 = load , ptr %retval.coerce + ret %1 +} + +; Test we bail out when using an alloca of a fixed-length vector (VLS) that was +; bitcasted to a scalable vector. +define void @select_load_alloca_to_svdouble_t() vscale_range(1) { +; CHECK-LABEL: @select_load_alloca_to_svdouble_t( +; CHECK-NEXT: [[Z:%.*]] = alloca <16 x half>, align 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 0, 0 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], ptr [[Z]], ptr null +; CHECK-NEXT: [[VAL:%.*]] = load , ptr [[COND]], align 16 +; CHECK-NEXT: ret void +; + %z = alloca <16 x half> + %cmp = icmp eq i32 0, 0 + %cond = select i1 %cmp, ptr %z, ptr null + %val = load , ptr %cond, align 16 + ret void +} + +define void @select_store_alloca_to_svdouble_t( %val) vscale_range(1) { +; CHECK-LABEL: @select_store_alloca_to_svdouble_t( +; CHECK-NEXT: [[Z:%.*]] = alloca <16 x half>, align 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 0, 0 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], ptr [[Z]], ptr null +; CHECK-NEXT: store [[VAL:%.*]], ptr [[COND]], align 16 +; CHECK-NEXT: ret void +; + %z = alloca <16 x half> + %cmp = icmp eq i32 0, 0 + %cond = select i1 %cmp, ptr %z, ptr null + store %val, ptr %cond, align 16 + ret void +} + +define <4 x i32> @fixed_alloca_fixed_from_scalable( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32( [[A:%.*]], i64 0) +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define <2 x i8> @fixed_alloca_fixed_from_scalable_requires_bitcast( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_requires_bitcast( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [[A:%.*]] to +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8( [[TMP1]], i64 0) +; CHECK-NEXT: ret <2 x i8> [[TMP2]] +; + %tmp = alloca <2 x i8> + store %a, ptr %tmp + %cast = load <2 x i8>, ptr %tmp + ret <2 x i8> %cast +} + +define <2 x ptr> @fixed_alloca_fixed_from_scalable_inttoptr( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_inttoptr( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [[A:%.*]] to +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64( [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr <2 x i64> [[TMP3]] to <2 x ptr> +; CHECK-NEXT: ret <2 x ptr> [[TMP2]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <2 x ptr>, ptr %tmp + ret <2 x ptr> %cast +} + +define <4 x i32> @fixed_alloca_fixed_from_scalable_ptrtoint( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoint( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint [[A:%.*]] to +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [[TMP1]] to +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32( [[TMP2]], i64 0) +; CHECK-NEXT: ret <4 x i32> [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = 
load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define <2 x ptr> @fixed_alloca_fixed_from_scalable_ptrtoptr( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoptr( +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.nxv2p0( [[A:%.*]], i64 0) +; CHECK-NEXT: ret <2 x ptr> [[TMP_0_CAST]] +; + %tmp = alloca <2 x ptr> + store %a, ptr %tmp + %cast = load <2 x ptr>, ptr %tmp + ret <2 x ptr> %cast +} + +define <2 x ptr> @fixed_alloca_fixed_from_scalable_ptrtoptr_different_addrspace( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoptr_different_addrspace( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint [[A:%.*]] to +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64( [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x ptr> +; CHECK-NEXT: ret <2 x ptr> [[TMP3]] +; + %tmp = alloca <2 x ptr> + store %a, ptr %tmp + %cast = load <2 x ptr>, ptr %tmp + ret <2 x ptr> %cast +} + +define @fixed_alloca_scalable_from_fixed(<4 x i32> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> [[A:%.*]], i64 0) +; CHECK-NEXT: ret [[TMP1]] +; + %tmp = alloca <4 x i32> + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_requires_bitcast(<2 x i8> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_requires_bitcast( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv2i8.v2i8( poison, <2 x i8> [[A:%.*]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [[TMP1]] to +; CHECK-NEXT: ret [[TMP2]] +; + %tmp = alloca <2 x i8> + store <2 x i8> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_inttoptr(<4 x i32> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_inttoptr( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> [[A:%.*]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [[TMP1]] to +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = inttoptr [[TMP2]] to +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_ptrtoint(<2 x ptr> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoint( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <2 x ptr> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = bitcast [[TMP2]] to +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store <2 x ptr> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_ptrtoptr(<2 x ptr> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoptr( +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = call @llvm.vector.insert.nxv2p0.v2p0( poison, <2 x ptr> [[A:%.*]], i64 0) +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca <2 x ptr> + store <2 x ptr> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_ptrtoptr_different_addrspace(<2 x ptr> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoptr_different_addrspace( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <2 x ptr> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> 
[[TMP1]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr [[TMP2]] to +; CHECK-NEXT: ret [[TMP3]] +; + %tmp = alloca <2 x ptr> + store <2 x ptr> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define <4 x i32> @scalable_alloca_fixed_from_scalable( %a) vscale_range(1) { +; CHECK-LABEL: @scalable_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load <4 x i32>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <4 x i32> [[CAST]] +; + %tmp = alloca + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define @scalable_alloca_scalable_from_fixed(<4 x i32> %a) vscale_range(1) { +; CHECK-LABEL: @scalable_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; CHECK-NEXT: store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[CAST]] +; + %tmp = alloca + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define i16 @scalar_alloca_scalar_from_scalable( %a) vscale_range(1) { +; CHECK-LABEL: @scalar_alloca_scalar_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load i16, ptr [[TMP]], align 2 +; CHECK-NEXT: ret i16 [[TMP_0_CAST]] +; + %tmp = alloca i16 + store %a, ptr %tmp + %cast = load i16, ptr %tmp + ret i16 %cast +} + +define @scalar_alloca_scalable_from_scalar(i16 %a) vscale_range(1) { +; CHECK-LABEL: @scalar_alloca_scalable_from_scalar( +; CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +; CHECK-NEXT: store i16 [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 2 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca i16 + store i16 %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define { <2 x i32>, <2 x i32> } @fixed_struct_alloca_fixed_from_scalable( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_struct_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca { <2 x i32>, <2 x i32> }, align 8 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 8 +; CHECK-NEXT: [[TMP_0_CAST_FCA_0_LOAD:%.*]] = load <2 x i32>, ptr [[TMP]], align 8 +; CHECK-NEXT: [[CAST_FCA_0_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP_0_CAST_FCA_0_LOAD]], 0 +; CHECK-NEXT: [[TMP_8_CAST_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 8 +; CHECK-NEXT: [[TMP_8_CAST_FCA_1_LOAD:%.*]] = load <2 x i32>, ptr [[TMP_8_CAST_FCA_1_GEP_SROA_IDX]], align 8 +; CHECK-NEXT: [[CAST_FCA_1_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[CAST_FCA_0_INSERT]], <2 x i32> [[TMP_8_CAST_FCA_1_LOAD]], 1 +; CHECK-NEXT: ret { <2 x i32>, <2 x i32> } [[CAST_FCA_1_INSERT]] +; + %tmp = alloca { <2 x i32>, <2 x i32> } + store %a, ptr %tmp + %cast = load { <2 x i32>, <2 x i32> }, ptr %tmp + ret { <2 x i32>, <2 x i32> } %cast +} + +define @fixed_struct_alloca_scalable_from_fixed({ <2 x ptr>, <2 x ptr> } %a) vscale_range(1) { +; CHECK-LABEL: @fixed_struct_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP:%.*]] = alloca { <2 x ptr>, <2 x ptr> }, align 16 +; CHECK-NEXT: [[A_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A:%.*]], 0 +; CHECK-NEXT: store <2 x ptr> [[A_FCA_0_EXTRACT]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[A_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A]], 1 +; CHECK-NEXT: [[TMP_16_A_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 16 +; 
CHECK-NEXT: store <2 x ptr> [[A_FCA_1_EXTRACT]], ptr [[TMP_16_A_FCA_1_GEP_SROA_IDX]], align 16 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca { <2 x ptr>, <2 x ptr> } + store { <2 x ptr>, <2 x ptr> } %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-MODIFY-CFG: {{.*}} +; CHECK-PRESERVE-CFG: {{.*}} diff --git a/llvm/test/Transforms/SROA/scalable-vectors.ll b/llvm/test/Transforms/SROA/scalable-vectors.ll index d892883ce9dc..346814d9f630 100644 --- a/llvm/test/Transforms/SROA/scalable-vectors.ll +++ b/llvm/test/Transforms/SROA/scalable-vectors.ll @@ -2,6 +2,8 @@ ; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG ; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" + ; This test checks that SROA runs mem2reg on scalable vectors. define @alloca_nxv16i1( %pg) { @@ -67,11 +69,12 @@ define @cast_alloca_to_svint32_t( %type.coe define @cast_alloca_from_svint32_t() { ; CHECK-LABEL: @cast_alloca_from_svint32_t( ; CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 -; CHECK-NEXT: store <16 x i32> undef, ptr [[RETVAL_COERCE]], align 16 +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr [[RETVAL_COERCE]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 16 ; CHECK-NEXT: ret [[TMP1]] ; %retval = alloca <16 x i32> + store <16 x i32> zeroinitializer, ptr %retval %retval.coerce = alloca call void @llvm.memcpy.p0.p0.i64(ptr align 16 %retval.coerce, ptr align 16 %retval, i64 64, i1 false) %1 = load , ptr %retval.coerce @@ -110,6 +113,224 @@ define void @select_store_alloca_to_svdouble_t( %val) { ret void } +define <4 x i32> @fixed_alloca_fixed_from_scalable( %a) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define <2 x i8> @fixed_alloca_fixed_from_scalable_requires_bitcast( %a) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_requires_bitcast( +; CHECK-NEXT: [[TMP:%.*]] = alloca <2 x i8>, align 2 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[TMP]], align 2 +; CHECK-NEXT: ret <2 x i8> [[TMP2]] +; + %tmp = alloca <2 x i8> + store %a, ptr %tmp + %cast = load <2 x i8>, ptr %tmp + ret <2 x i8> %cast +} + +define <2 x ptr> @fixed_alloca_fixed_from_scalable_inttoptr( %a) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_inttoptr( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x ptr>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <2 x ptr> [[TMP2]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <2 x ptr>, ptr %tmp + ret <2 x ptr> %cast +} + +define <4 x i32> @fixed_alloca_fixed_from_scalable_ptrtoint( %a) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoint( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 
x i32>, align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load <4 x i32>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <4 x i32> [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define <2 x ptr> @fixed_alloca_fixed_from_scalable_ptrtoptr( %a) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoptr( +; CHECK-NEXT: [[TMP:%.*]] = alloca <2 x ptr>, align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load <2 x ptr>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <2 x ptr> [[CAST]] +; + %tmp = alloca <2 x ptr> + store %a, ptr %tmp + %cast = load <2 x ptr>, ptr %tmp + ret <2 x ptr> %cast +} + +define @fixed_alloca_scalable_from_fixed(<4 x i32> %a) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[TMP1]] +; + %tmp = alloca <4 x i32> + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_requires_bitcast(<2 x i8> %a) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_requires_bitcast( +; CHECK-NEXT: [[TMP:%.*]] = alloca <2 x i8>, align 2 +; CHECK-NEXT: store <2 x i8> [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[TMP]], align 2 +; CHECK-NEXT: ret [[TMP2]] +; + %tmp = alloca <2 x i8> + store <2 x i8> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_inttoptr(<4 x i32> %a) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_inttoptr( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_ptrtoint(<2 x ptr> %a) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoint( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store <2 x ptr> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store <2 x ptr> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_ptrtoptr(<2 x ptr> %a) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoptr( +; CHECK-NEXT: [[TMP:%.*]] = alloca <2 x ptr>, align 16 +; CHECK-NEXT: store <2 x ptr> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[CAST]] +; + %tmp = alloca <2 x ptr> + store <2 x ptr> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define <4 x i32> @scalable_alloca_fixed_from_scalable( %a) { +; CHECK-LABEL: @scalable_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load <4 x i32>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <4 x i32> [[CAST]] +; + %tmp = alloca + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define @scalable_alloca_scalable_from_fixed(<4 x i32> %a) { +; CHECK-LABEL: @scalable_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; 
CHECK-NEXT: store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[CAST]] +; + %tmp = alloca + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define i16 @scalar_alloca_scalar_from_scalable( %a) { +; CHECK-LABEL: @scalar_alloca_scalar_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load i16, ptr [[TMP]], align 2 +; CHECK-NEXT: ret i16 [[TMP_0_CAST]] +; + %tmp = alloca i16 + store %a, ptr %tmp + %cast = load i16, ptr %tmp + ret i16 %cast +} + +define @scalar_alloca_scalable_from_scalar(i16 %a) { +; CHECK-LABEL: @scalar_alloca_scalable_from_scalar( +; CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +; CHECK-NEXT: store i16 [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 2 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca i16 + store i16 %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define { <2 x i32>, <2 x i32> } @fixed_struct_alloca_fixed_from_scalable( %a) { +; CHECK-LABEL: @fixed_struct_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca { <2 x i32>, <2 x i32> }, align 8 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST_FCA_0_GEP:%.*]] = getelementptr inbounds { <2 x i32>, <2 x i32> }, ptr [[TMP]], i32 0, i32 0 +; CHECK-NEXT: [[TMP_0_CAST_FCA_0_LOAD:%.*]] = load <2 x i32>, ptr [[CAST_FCA_0_GEP]], align 8 +; CHECK-NEXT: [[CAST_FCA_0_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP_0_CAST_FCA_0_LOAD]], 0 +; CHECK-NEXT: [[TMP_8_CAST_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds { <2 x i32>, <2 x i32> }, ptr [[TMP]], i32 0, i32 1 +; CHECK-NEXT: [[TMP_8_CAST_FCA_1_LOAD:%.*]] = load <2 x i32>, ptr [[TMP_8_CAST_FCA_1_GEP_SROA_IDX]], align 8 +; CHECK-NEXT: [[CAST_FCA_1_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[CAST_FCA_0_INSERT]], <2 x i32> [[TMP_8_CAST_FCA_1_LOAD]], 1 +; CHECK-NEXT: ret { <2 x i32>, <2 x i32> } [[CAST_FCA_1_INSERT]] +; + %tmp = alloca { <2 x i32>, <2 x i32> } + store %a, ptr %tmp + %cast = load { <2 x i32>, <2 x i32> }, ptr %tmp + ret { <2 x i32>, <2 x i32> } %cast +} + +define @fixed_struct_alloca_scalable_from_fixed({ <2 x ptr>, <2 x ptr> } %a) { +; CHECK-LABEL: @fixed_struct_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP:%.*]] = alloca { <2 x ptr>, <2 x ptr> }, align 16 +; CHECK-NEXT: [[A_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A:%.*]], 0 +; CHECK-NEXT: [[A_FCA_0_GEP:%.*]] = getelementptr inbounds { <2 x ptr>, <2 x ptr> }, ptr [[TMP]], i32 0, i32 0 +; CHECK-NEXT: store <2 x ptr> [[A_FCA_0_EXTRACT]], ptr [[A_FCA_0_GEP]], align 16 +; CHECK-NEXT: [[A_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A]], 1 +; CHECK-NEXT: [[TMP_16_A_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds { <2 x ptr>, <2 x ptr> }, ptr [[TMP]], i32 0, i32 1 +; CHECK-NEXT: store <2 x ptr> [[A_FCA_1_EXTRACT]], ptr [[TMP_16_A_FCA_1_GEP_SROA_IDX]], align 16 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 32 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca { <2 x ptr>, <2 x ptr> } + store { <2 x ptr>, <2 x ptr> } %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK-MODIFY-CFG: {{.*}}
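
Editorial note (not part of the patch): the SROA hunks above only treat a scalable load/store size as usable when the enclosing function pins vscale to a single value, i.e. when the new Function::getVScaleValue() helper returns a non-zero constant. The short sketch below restates that rule in isolation; it is an illustration only, the helper name fixedSizeInBitsOrZero is invented here, and it assumes the getVScaleValue() addition shown in the Function.h/Function.cpp hunks above.

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Function.h"
    #include "llvm/Support/TypeSize.h"

    using namespace llvm;

    // Resolve a possibly scalable type size to a fixed bit count, following the
    // same rule the SROA changes above apply: a scalable size is only usable
    // when the function carries an exact vscale_range, so getVScaleValue() != 0.
    static uint64_t fixedSizeInBitsOrZero(const Function &F, Type *Ty,
                                          const DataLayout &DL) {
      TypeSize TS = DL.getTypeSizeInBits(Ty);
      if (!TS.isScalable())
        return TS.getFixedValue();
      if (unsigned VScale = F.getVScaleValue())
        return TS.getKnownMinValue() * VScale; // e.g. nxv4i32 at vscale=1 -> 128 bits
      return 0; // vscale unknown: corresponds to the PI.setAborted() bail-outs above.
    }

Under vscale_range(1) this makes <4 x i32> and <vscale x 4 x i32> the same size, which is why the tests in scalable-vectors-with-known-vscale.ll promote the mixed fixed/scalable allocas, while the corresponding cases in scalable-vectors.ll (no vscale_range attribute) are left as loads and stores through memory.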