[AArch64] Improve lowering for scalable masked interleaving stores
Similar to #154338, this PR aims to support lowering of certain IR patterns to SVE's st2 and st4 instructions. A typical IR pattern looks like this:
%mask = .. @llvm.vector.interleave2(<vscale x 16 x i1> %m, <vscale x 16 x i1> %m)
%val = .. @llvm.vector.interleave2(<vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2)
.. @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> %val, ..., <vscale x 32 x i1> %mask)
where both the value and the mask being passed to the wide masked store are interleaved. When the parts of the mask interleave are identical, we can lower this to st2b.
This PR adds a DAG combine that lowers this kind of IR pattern to the SVE st2X and st4X instructions.
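For illustration, here is a minimal self-contained version of the st2-style pattern above as complete IR. It is only a sketch: the function name, the i8 element type and the alignment of 1 are chosen for the example and are not taken from the patch.

declare <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8>, ptr, i32 immarg, <vscale x 32 x i1>)

define void @store_interleaved_i8(<vscale x 16 x i1> %m, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2, ptr %p) {
  ; Both halves of the interleaved mask are the same predicate %m.
  %mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %m, <vscale x 16 x i1> %m)
  ; Interleave the two data vectors into one double-width vector.
  %val = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2)
  ; Wide masked store of the interleaved data; with identical mask parts this
  ; is the shape the new DAG combine is meant to turn into a single st2b.
  call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> %val, ptr %p, i32 1, <vscale x 32 x i1> %mask)
  ret void
}

Compiling something like this with llc -mtriple=aarch64-linux-gnu -mattr=+sve is one way to check whether the interleaves and the wide masked store collapse into a single structured store.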
Excerpts from the updated tests in the diff:

  %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
  %interleaved.value = tail call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %val1, <vscale x 2 x i64> %val2)
@@ -68,24 +56,11 @@ define void @foo_st2_nxv2i64(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %val1,
define void @foo_st4_nxv16i8(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2, <vscale x 16 x i8> %val3, <vscale x 16 x i8> %val4, ptr %p) {
  %interleaved.mask = tail call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
  %interleaved.value = tail call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2, <vscale x 16 x i8> %val3, <vscale x 16 x i8> %val4)
@@ -96,24 +71,11 @@ define void @foo_st4_nxv16i8(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %val1,
define void @foo_st4_nxv8i16(<vscale x 8 x i1> %mask, <vscale x 8 x i16> %val1, <vscale x 8 x i16> %val2, <vscale x 8 x i16> %val3, <vscale x 8 x i16> %val4, ptr %p) {
  %interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
  %interleaved.value = tail call <vscale x 32 x i16> @llvm.vector.interleave4.nxv32i16(<vscale x 8 x i16> %val1, <vscale x 8 x i16> %val2, <vscale x 8 x i16> %val3, <vscale x 8 x i16> %val4)
@@ -124,24 +86,11 @@ define void @foo_st4_nxv8i16(<vscale x 8 x i1> %mask, <vscale x 8 x i16> %val1,
define void @foo_st4_nxv4i32(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %val1, <vscale x 4 x i32> %val2, <vscale x 4 x i32> %val3, <vscale x 4 x i32> %val4, ptr %p) {
  %interleaved.mask = tail call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask)
  %interleaved.value = tail call <vscale x 16 x i32> @llvm.vector.interleave4.nxv16i32(<vscale x 4 x i32> %val1, <vscale x 4 x i32> %val2, <vscale x 4 x i32> %val3, <vscale x 4 x i32> %val4)
@@ -152,24 +101,11 @@ define void @foo_st4_nxv4i32(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %val1,
define void @foo_st4_nxv2i64(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %val1, <vscale x 2 x i64> %val2, <vscale x 2 x i64> %val3, <vscale x 2 x i64> %val4, ptr %p) {
  %interleaved.mask = tail call <vscale x 8 x i1> @llvm.vector.interleave4.nxv8i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
  %interleaved.value = tail call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> %val1, <vscale x 2 x i64> %val2, <vscale x 2 x i64> %val3, <vscale x 2 x i64> %val4)
@@ -181,13 +117,12 @@ define void @foo_st2_nxv16i8_mul_use_mask(<vscale x 16 x i1> %mask, <vscale x 16
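The interleave4 cases follow the same shape, just with four data operands and a four-way mask interleave. The masked store itself is not visible in the excerpts above, so here is a sketch of what the complete nxv16i8 st4-style pattern presumably looks like; the function name and the alignment value are illustrative, not copied from the test file.

declare <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
declare <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8>, ptr, i32 immarg, <vscale x 64 x i1>)

define void @store_interleave4_i8(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2, <vscale x 16 x i8> %v3, <vscale x 16 x i8> %v4, ptr %p) {
  ; All four parts of the interleaved mask are the same predicate, which is
  ; the condition under which the combine can use a structured store.
  %interleaved.mask = tail call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
  ; Four-way interleave of the data vectors into one wide vector.
  %interleaved.value = tail call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2, <vscale x 16 x i8> %v3, <vscale x 16 x i8> %v4)
  ; Wide masked store of the interleaved data; a candidate for a single st4b.
  call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> %interleaved.value, ptr %p, i32 1, <vscale x 64 x i1> %interleaved.mask)
  ret void
}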