[ARM] Protect against odd sized vectors in isVTRNMask and friends #153413

Merged · 1 commit · Aug 13, 2025
8 changes: 4 additions & 4 deletions llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -7406,7 +7406,7 @@ static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
return false;

unsigned NumElts = VT.getVectorNumElements();
- if (M.size() != NumElts && M.size() != NumElts*2)
+ if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
return false;

// If the mask is twice as long as the input vector then we need to check the
@@ -7438,7 +7438,7 @@ static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
return false;

unsigned NumElts = VT.getVectorNumElements();
- if (M.size() != NumElts && M.size() != NumElts*2)
+ if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
return false;

for (unsigned i = 0; i < M.size(); i += NumElts) {
@@ -7541,7 +7541,7 @@ static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
return false;

unsigned NumElts = VT.getVectorNumElements();
- if (M.size() != NumElts && M.size() != NumElts*2)
+ if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
return false;

for (unsigned i = 0; i < M.size(); i += NumElts) {
@@ -7574,7 +7574,7 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
return false;

unsigned NumElts = VT.getVectorNumElements();
- if (M.size() != NumElts && M.size() != NumElts*2)
+ if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
return false;

for (unsigned i = 0; i < M.size(); i += NumElts) {
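
All four predicates walk the candidate shuffle mask two lanes at a time, since VTRN/VZIP/VUZP operate on element pairs. With an odd element count — reachable through illegal types such as the <15 x i16> vectors in the new tests — the pairwise loop's M[i + j + 1] access lands one element past the end of the mask on its final iteration, which is what the added `|| NumElts % 2 != 0` clause rules out. A minimal, self-contained sketch of the shape of these checks (simplified and renamed by us; the real helpers take an ArrayRef<int> and an EVT, and derive WhichResult from the first defined lane of each half):

// Sketch only -- mirrors the structure of isVTRNMask, not a verbatim copy.
#include <cstddef>

static bool sketchIsVTRNMask(const int *M, size_t MaskLen, unsigned NumElts,
                             unsigned &WhichResult) {
  // The new guard: without the NumElts % 2 check, an odd NumElts (e.g. 15)
  // reaches the loop below, and at j == NumElts - 1 the M[i + j + 1] access
  // reads M[i + NumElts] -- one past the end of the final NumElts block.
  if ((MaskLen != NumElts && MaskLen != NumElts * 2) || NumElts % 2 != 0)
    return false;

  for (unsigned i = 0; i < MaskLen; i += NumElts) {
    // WhichResult picks between the two VTRN outputs, <0,N,2,N+2,...> and
    // <1,N+1,3,N+3,...>; a mask value of -1 encodes an undef lane and
    // matches anything.
    WhichResult = (M[i] >= 0 && (M[i] & 1)) ? 1 : 0;
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i + j] >= 0 && (unsigned)M[i + j] != j + WhichResult) ||
          (M[i + j + 1] >= 0 &&
           (unsigned)M[i + j + 1] != j + NumElts + WhichResult))
        return false;
    }
  }
  return true;
}
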
180 changes: 125 additions & 55 deletions llvm/test/CodeGen/ARM/vtrn.ll
@@ -1,3 +1,4 @@
+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s

define <8 x i8> @vtrni8(ptr %A, ptr %B) nounwind {
@@ -20,11 +21,11 @@ define <8 x i8> @vtrni8(ptr %A, ptr %B) nounwind {
define <16 x i8> @vtrni8_Qres(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vtrni8_Qres:
; CHECK: @ %bb.0:
- ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
- ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
- ; CHECK-NEXT: vtrn.8 [[LDR0]], [[LDR1]]
- ; CHECK-NEXT: vmov r0, r1, [[LDR0]]
- ; CHECK-NEXT: vmov r2, r3, [[LDR1]]
+ ; CHECK-NEXT: vldr d16, [r1]
+ ; CHECK-NEXT: vldr d17, [r0]
+ ; CHECK-NEXT: vtrn.8 d17, d16
+ ; CHECK-NEXT: vmov r0, r1, d17
+ ; CHECK-NEXT: vmov r2, r3, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load <8 x i8>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
@@ -52,11 +53,11 @@ define <4 x i16> @vtrni16(ptr %A, ptr %B) nounwind {
define <8 x i16> @vtrni16_Qres(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vtrni16_Qres:
; CHECK: @ %bb.0:
- ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
- ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
- ; CHECK-NEXT: vtrn.16 [[LDR0]], [[LDR1]]
- ; CHECK-NEXT: vmov r0, r1, [[LDR0]]
- ; CHECK-NEXT: vmov r2, r3, [[LDR1]]
+ ; CHECK-NEXT: vldr d16, [r1]
+ ; CHECK-NEXT: vldr d17, [r0]
+ ; CHECK-NEXT: vtrn.16 d17, d16
+ ; CHECK-NEXT: vmov r0, r1, d17
+ ; CHECK-NEXT: vmov r2, r3, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
@@ -84,11 +85,11 @@ define <2 x i32> @vtrni32(ptr %A, ptr %B) nounwind {
define <4 x i32> @vtrni32_Qres(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vtrni32_Qres:
; CHECK: @ %bb.0:
- ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
- ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
- ; CHECK-NEXT: vtrn.32 [[LDR0]], [[LDR1]]
- ; CHECK-NEXT: vmov r0, r1, [[LDR0]]
- ; CHECK-NEXT: vmov r2, r3, [[LDR1]]
+ ; CHECK-NEXT: vldr d16, [r1]
+ ; CHECK-NEXT: vldr d17, [r0]
+ ; CHECK-NEXT: vtrn.32 d17, d16
+ ; CHECK-NEXT: vmov r0, r1, d17
+ ; CHECK-NEXT: vmov r2, r3, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
@@ -116,11 +117,11 @@ define <2 x float> @vtrnf(ptr %A, ptr %B) nounwind {
define <4 x float> @vtrnf_Qres(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vtrnf_Qres:
; CHECK: @ %bb.0:
- ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
- ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
- ; CHECK-NEXT: vtrn.32 [[LDR0]], [[LDR1]]
- ; CHECK-NEXT: vmov r0, r1, [[LDR0]]
- ; CHECK-NEXT: vmov r2, r3, [[LDR1]]
+ ; CHECK-NEXT: vldr d16, [r1]
+ ; CHECK-NEXT: vldr d17, [r0]
+ ; CHECK-NEXT: vtrn.32 d17, d16
+ ; CHECK-NEXT: vmov r0, r1, d17
+ ; CHECK-NEXT: vmov r2, r3, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load <2 x float>, ptr %A
%tmp2 = load <2 x float>, ptr %B
@@ -281,11 +282,11 @@ define <8 x i8> @vtrni8_undef(ptr %A, ptr %B) nounwind {
define <16 x i8> @vtrni8_undef_Qres(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vtrni8_undef_Qres:
; CHECK: @ %bb.0:
- ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
- ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
- ; CHECK-NEXT: vtrn.8 [[LDR0]], [[LDR1]]
- ; CHECK-NEXT: vmov r0, r1, [[LDR0]]
- ; CHECK-NEXT: vmov r2, r3, [[LDR1]]
+ ; CHECK-NEXT: vldr d16, [r1]
+ ; CHECK-NEXT: vldr d17, [r0]
+ ; CHECK-NEXT: vtrn.8 d17, d16
+ ; CHECK-NEXT: vmov r0, r1, d17
+ ; CHECK-NEXT: vmov r2, r3, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load <8 x i8>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
@@ -327,9 +328,15 @@ define <16 x i16> @vtrnQi16_undef_QQres(ptr %A, ptr %B) nounwind {
}

define <8 x i16> @vtrn_lower_shufflemask_undef(ptr %A, ptr %B) {
+ ; CHECK-LABEL: vtrn_lower_shufflemask_undef:
+ ; CHECK: @ %bb.0: @ %entry
+ ; CHECK-NEXT: vldr d16, [r1]
+ ; CHECK-NEXT: vldr d17, [r0]
+ ; CHECK-NEXT: vtrn.16 d17, d16
+ ; CHECK-NEXT: vmov r0, r1, d16
+ ; CHECK-NEXT: vmov r2, r3, d16
+ ; CHECK-NEXT: mov pc, lr
entry:
- ; CHECK-LABEL: vtrn_lower_shufflemask_undef
- ; CHECK: vtrn
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 5, i32 3, i32 7>
@@ -340,12 +347,26 @@
; values do modify the type. However, we get different input types, as some of
; them get truncated from i32 to i8 (from comparing cmp0 with cmp1) and some of
; them get truncated from i16 to i8 (from comparing cmp2 with cmp3).
- define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1,
- <4 x i32> %cmp0, <4 x i32> %cmp1,
- <4 x i16> %cmp2, <4 x i16> %cmp3) {
- ; CHECK-LABEL: vtrn_mismatched_builvector0:
- ; CHECK: vmovn.i32
- ; CHECK: vbsl
+ define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1, <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i16> %cmp2, <4 x i16> %cmp3) {
+ ; CHECK-LABEL: vtrn_mismatched_builvector0:
+ ; CHECK: @ %bb.0:
+ ; CHECK-NEXT: mov r12, sp
+ ; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
+ ; CHECK-NEXT: add r12, sp, #16
+ ; CHECK-NEXT: vld1.64 {d18, d19}, [r12]
+ ; CHECK-NEXT: vcgt.u32 q8, q9, q8
+ ; CHECK-NEXT: vldr d20, [sp, #32]
+ ; CHECK-NEXT: vldr d18, [sp, #40]
+ ; CHECK-NEXT: vcgt.u16 d18, d18, d20
+ ; CHECK-NEXT: vmovn.i32 d16, q8
+ ; CHECK-NEXT: vmov d17, r2, r3
+ ; CHECK-NEXT: vtrn.8 d16, d18
+ ; CHECK-NEXT: vmov d18, r0, r1
+ ; CHECK-NEXT: vshl.i8 d16, d16, #7
+ ; CHECK-NEXT: vshr.s8 d16, d16, #7
+ ; CHECK-NEXT: vbsl d16, d18, d17
+ ; CHECK-NEXT: vmov r0, r1, d16
+ ; CHECK-NEXT: mov pc, lr
%c0 = icmp ult <4 x i32> %cmp0, %cmp1
%c1 = icmp ult <4 x i16> %cmp2, %cmp3
%c = shufflevector <4 x i1> %c0, <4 x i1> %c1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -356,12 +377,30 @@ define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1,
; Here we get a build_vector node, where half the incoming extract_element
; values do not modify the type (the values form cmp2), but half of them do
; (from the icmp operation).
- define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1,
- <4 x i32> %cmp0, <4 x i32> %cmp1, ptr %cmp2_ptr) {
- ; CHECK-LABEL: vtrn_mismatched_builvector1:
- ; We need to extend the 4 x i8 to 4 x i16 in order to perform the vtrn
- ; CHECK: vmovl
- ; CHECK: vbsl
+ ; We need to extend the 4 x i8 to 4 x i16 in order to perform the vtrn
+ define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1, <4 x i32> %cmp0, <4 x i32> %cmp1, ptr %cmp2_ptr) {
+ ; CHECK-LABEL: vtrn_mismatched_builvector1:
+ ; CHECK: @ %bb.0:
+ ; CHECK-NEXT: .save {r11, lr}
+ ; CHECK-NEXT: push {r11, lr}
+ ; CHECK-NEXT: add r12, sp, #8
+ ; CHECK-NEXT: add lr, sp, #24
+ ; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
+ ; CHECK-NEXT: ldr r12, [sp, #40]
+ ; CHECK-NEXT: vld1.64 {d18, d19}, [lr]
+ ; CHECK-NEXT: vcgt.u32 q8, q9, q8
+ ; CHECK-NEXT: vld1.32 {d18[0]}, [r12:32]
+ ; CHECK-NEXT: vmovl.u8 q9, d18
+ ; CHECK-NEXT: vmovn.i32 d16, q8
+ ; CHECK-NEXT: vmov d17, r2, r3
+ ; CHECK-NEXT: vtrn.8 d16, d18
+ ; CHECK-NEXT: vmov d18, r0, r1
+ ; CHECK-NEXT: vshl.i8 d16, d16, #7
+ ; CHECK-NEXT: vshr.s8 d16, d16, #7
+ ; CHECK-NEXT: vbsl d16, d18, d17
+ ; CHECK-NEXT: vmov r0, r1, d16
+ ; CHECK-NEXT: pop {r11, lr}
+ ; CHECK-NEXT: mov pc, lr
%cmp2_load = load <4 x i8>, ptr %cmp2_ptr, align 4
%cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
%c0 = icmp ult <4 x i32> %cmp0, %cmp1
@@ -373,15 +412,15 @@ define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1,
; The shuffle mask is half a vtrn; we duplicate the half to produce the
; full result.
define void @lower_twice_no_vtrn(ptr %A, ptr %B, ptr %C) {
+ ; CHECK-LABEL: lower_twice_no_vtrn:
+ ; CHECK: @ %bb.0: @ %entry
+ ; CHECK-NEXT: vldr d16, [r1]
+ ; CHECK-NEXT: vldr d18, [r0]
+ ; CHECK-NEXT: vtrn.16 d18, d16
+ ; CHECK-NEXT: vorr d17, d16, d16
+ ; CHECK-NEXT: vst1.64 {d16, d17}, [r2]
+ ; CHECK-NEXT: mov pc, lr
entry:
- ; CHECK-LABEL: lower_twice_no_vtrn:
- ; CHECK: @ %bb.0:
- ; CHECK-NEXT: vldr d16, [r1]
- ; CHECK-NEXT: vldr d18, [r0]
- ; CHECK-NEXT: vtrn.16 d18, d16
- ; CHECK-NEXT: vorr d17, d16, d16
- ; CHECK-NEXT: vst1.64 {d16, d17}, [r2]
- ; CHECK-NEXT: mov pc, lr
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 5, i32 3, i32 7, i32 1, i32 5, i32 3, i32 7>
@@ -392,18 +431,49 @@ entry:
; The shuffle mask is half a vtrn; we duplicate the half to produce the
; full result.
define void @upper_twice_no_vtrn(ptr %A, ptr %B, ptr %C) {
+ ; CHECK-LABEL: upper_twice_no_vtrn:
+ ; CHECK: @ %bb.0: @ %entry
+ ; CHECK-NEXT: vldr d16, [r1]
+ ; CHECK-NEXT: vldr d18, [r0]
+ ; CHECK-NEXT: vtrn.16 d18, d16
+ ; CHECK-NEXT: vorr d19, d18, d18
+ ; CHECK-NEXT: vst1.64 {d18, d19}, [r2]
+ ; CHECK-NEXT: mov pc, lr
entry:
- ; CHECK-LABEL: upper_twice_no_vtrn:
- ; CHECK: @ %bb.0:
- ; CHECK-NEXT: vldr d16, [r1]
- ; CHECK-NEXT: vldr d18, [r0]
- ; CHECK-NEXT: vtrn.16 d18, d16
- ; CHECK-NEXT: vorr d19, d18, d18
- ; CHECK-NEXT: vst1.64 {d18, d19}, [r2]
- ; CHECK-NEXT: mov pc, lr
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 2, i32 6, i32 0, i32 4, i32 2, i32 6>
store <8 x i16> %0, ptr %C
ret void
}
+
+ define void @test_15xi16(ptr %next.gep, ptr %next.gep13) {
+ ; CHECK-LABEL: test_15xi16:
+ ; CHECK: @ %bb.0:
+ ; CHECK-NEXT: add r2, r0, #2
+ ; CHECK-NEXT: add r3, r0, #6
+ ; CHECK-NEXT: vld1.16 {d16, d17}, [r2]!
+ ; CHECK-NEXT: vld1.16 {d18}, [r2]!
+ ; CHECK-NEXT: vld1.16 {d20, d21}, [r3]!
+ ; CHECK-NEXT: ldr r2, [r2]
+ ; CHECK-NEXT: vld1.16 {d22}, [r3]!
+ ; CHECK-NEXT: vmov.16 d19[0], r2
+ ; CHECK-NEXT: ldr r3, [r3]
+ ; CHECK-NEXT: add r2, r0, #30
+ ; CHECK-NEXT: add r0, r0, #34
+ ; CHECK-NEXT: vmov.16 d19[1], r3
+ ; CHECK-NEXT: vld1.16 {d19[2]}, [r2:16]
+ ; CHECK-NEXT: vtrn.16 q8, q10
+ ; CHECK-NEXT: vld1.16 {d19[3]}, [r0:16]
+ ; CHECK-NEXT: vtrn.16 d18, d22
+ ; CHECK-NEXT: vst1.16 {d16, d17}, [r1]!
+ ; CHECK-NEXT: vst1.16 {d18, d19}, [r1]
+ ; CHECK-NEXT: mov pc, lr
+ %a = getelementptr inbounds nuw i8, ptr %next.gep, i32 2
+ %b = load <15 x i16>, ptr %a, align 2
+ %c = getelementptr inbounds nuw i8, ptr %next.gep, i32 6
+ %d = load <15 x i16>, ptr %c, align 2
+ %interleaved.vec = shufflevector <15 x i16> %b, <15 x i16> %d, <16 x i32> <i32 0, i32 15, i32 2, i32 17, i32 4, i32 19, i32 6, i32 21, i32 8, i32 23, i32 10, i32 25, i32 12, i32 27, i32 14, i32 29>
+ store <16 x i16> %interleaved.vec, ptr %next.gep13, align 2
+ ret void
+ }
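
One way to see why this test stresses the new oddness guard: the interleave mask above pairs each even output lane j with input lanes (j, j + 15), i.e. a VTRN-result-0 shape whose lane distance is the odd element count 15. A tiny standalone check of that claim (illustrative code of ours, not part of the patch):

// Verifies that the <16 x i32> mask in test_15xi16 is the VTRN "result 0"
// pattern for N = 15 input elements: every lane pair is (j, j + 15).
#include <cstdio>

int main() {
  const int N = 15;
  const int Mask[16] = {0, 15, 2, 17, 4, 19, 6, 21,
                        8, 23, 10, 25, 12, 27, 14, 29};
  for (int j = 0; j + 1 < 16; j += 2)
    printf("pair %d: (%d, %d) %s\n", j / 2, Mask[j], Mask[j + 1],
           (Mask[j] == j && Mask[j + 1] == j + N) ? "= (j, j+15)"
                                                  : "mismatch");
  return 0;
}

With the guard in place, the odd-width pieces are widened before matching, and the lowering above still uses vtrn.16 — but only on legal, even-width registers.
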
56 changes: 56 additions & 0 deletions llvm/test/CodeGen/ARM/vuzp.ll
@@ -535,3 +535,59 @@ define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
%.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
ret %struct.uint8x8x2_t %.fca.0.1.insert
}
+
+ define void @test_15xi16(ptr %next.gep, ptr %next.gep13) {
+ ; CHECK-LABEL: test_15xi16:
+ ; CHECK: @ %bb.0:
+ ; CHECK-NEXT: .save {r4, r5, r6, lr}
+ ; CHECK-NEXT: push {r4, r5, r6, lr}
+ ; CHECK-NEXT: add r2, r0, #2
+ ; CHECK-NEXT: add r3, r0, #6
+ ; CHECK-NEXT: vld1.16 {d20, d21}, [r2]!
+ ; CHECK-NEXT: vld1.16 {d16}, [r2]!
+ ; CHECK-NEXT: vmov.u16 r12, d16[0]
+ ; CHECK-NEXT: ldr r2, [r2]
+ ; CHECK-NEXT: vmov.u16 r4, d20[0]
+ ; CHECK-NEXT: vld1.16 {d22, d23}, [r3]!
+ ; CHECK-NEXT: vld1.16 {d24}, [r3]!
+ ; CHECK-NEXT: vmov.u16 lr, d16[2]
+ ; CHECK-NEXT: vmov.u16 r5, d22[0]
+ ; CHECK-NEXT: vmov.u16 r6, d21[0]
+ ; CHECK-NEXT: vmov.16 d17[0], r12
+ ; CHECK-NEXT: vmov.16 d16[0], r4
+ ; CHECK-NEXT: vmov.u16 r4, d24[0]
+ ; CHECK-NEXT: vmov.u16 r12, d24[2]
+ ; CHECK-NEXT: vmov.16 d17[1], lr
+ ; CHECK-NEXT: vmov.16 d18[0], r5
+ ; CHECK-NEXT: vmov.u16 r5, d20[2]
+ ; CHECK-NEXT: vmov.u16 lr, d23[0]
+ ; CHECK-NEXT: vmov.16 d19[0], r4
+ ; CHECK-NEXT: vmov.u16 r4, d22[2]
+ ; CHECK-NEXT: vmov.16 d16[1], r5
+ ; CHECK-NEXT: vmov.u16 r5, d21[2]
+ ; CHECK-NEXT: vmov.16 d17[2], r2
+ ; CHECK-NEXT: ldr r2, [r3]
+ ; CHECK-NEXT: vmov.16 d16[2], r6
+ ; CHECK-NEXT: vmov.16 d18[1], r4
+ ; CHECK-NEXT: vmov.u16 r4, d23[2]
+ ; CHECK-NEXT: vmov.16 d19[1], r12
+ ; CHECK-NEXT: vmov.16 d18[2], lr
+ ; CHECK-NEXT: vmov.16 d19[2], r2
+ ; CHECK-NEXT: add r2, r0, #30
+ ; CHECK-NEXT: add r0, r0, #34
+ ; CHECK-NEXT: vld1.16 {d17[3]}, [r2:16]
+ ; CHECK-NEXT: vmov.16 d16[3], r5
+ ; CHECK-NEXT: vmov.16 d18[3], r4
+ ; CHECK-NEXT: vld1.16 {d19[3]}, [r0:16]
+ ; CHECK-NEXT: vst1.16 {d16, d17}, [r1]!
+ ; CHECK-NEXT: vst1.16 {d18, d19}, [r1]
+ ; CHECK-NEXT: pop {r4, r5, r6, lr}
+ ; CHECK-NEXT: mov pc, lr
+ %a = getelementptr inbounds nuw i8, ptr %next.gep, i32 2
+ %b = load <15 x i16>, ptr %a, align 2
+ %c = getelementptr inbounds nuw i8, ptr %next.gep, i32 6
+ %d = load <15 x i16>, ptr %c, align 2
+ %interleaved.vec = shufflevector <15 x i16> %b, <15 x i16> %d, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29>
+ store <16 x i16> %interleaved.vec, ptr %next.gep13, align 2
+ ret void
+ }
19 changes: 19 additions & 0 deletions llvm/test/CodeGen/ARM/vzip.ll
@@ -381,3 +381,22 @@ entry:
%vzip.i = shufflevector <8 x i8> %lane, <8 x i8> %lane3, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
ret <8 x i8> %vzip.i
}
+
+ define <16 x i16> @test_15xi16(ptr %next.gep, ptr %next.gep13) {
+ ; CHECK-LABEL: test_15xi16:
+ ; CHECK: @ %bb.0:
+ ; CHECK-NEXT: add r1, r1, #2
+ ; CHECK-NEXT: mov r2, #4
+ ; CHECK-NEXT: vld1.16 {d16, d17}, [r1], r2
+ ; CHECK-NEXT: vld1.16 {d18, d19}, [r1]
+ ; CHECK-NEXT: vzip.16 q8, q9
+ ; CHECK-NEXT: vst1.16 {d16, d17}, [r0:128]!
+ ; CHECK-NEXT: vst1.64 {d18, d19}, [r0:128]
+ ; CHECK-NEXT: mov pc, lr
+ %a = getelementptr inbounds nuw i8, ptr %next.gep, i32 2
+ %b = load <15 x i16>, ptr %a, align 2
+ %c = getelementptr inbounds nuw i8, ptr %next.gep, i32 6
+ %d = load <15 x i16>, ptr %c, align 2
+ %interleaved.vec = shufflevector <15 x i16> %b, <15 x i16> %d, <16 x i32> <i32 0, i32 15, i32 1, i32 16, i32 2, i32 17, i32 3, i32 18, i32 4, i32 19, i32 5, i32 20, i32 6, i32 21, i32 7, i32 22>
+ ret <16 x i16> %interleaved.vec
+ }
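
For reference, the three mask shapes these predicates recognize — and that the three test files exercise — can be written down generatively. A small illustrative program (ours; the generator names are not LLVM's). Every pattern consumes lanes in pairs or in equal halves, which is why an even element count N is a precondition for all of them:

// Illustrative generators for the shuffle-mask shapes matched by isVTRNMask,
// isVZIPMask and isVUZPMask, for an even element count N and result R (0/1).
#include <cstdio>
#include <vector>

static std::vector<int> vtrnMask(unsigned N, unsigned R) {
  std::vector<int> M; // transpose: <R, N+R, 2+R, N+2+R, ...>
  for (unsigned j = 0; j < N; j += 2) {
    M.push_back(j + R);
    M.push_back(j + N + R);
  }
  return M;
}

static std::vector<int> vzipMask(unsigned N, unsigned R) {
  std::vector<int> M; // zip: interleave the low (R=0) or high (R=1) halves
  for (unsigned j = 0; j < N / 2; ++j) {
    M.push_back(R * (N / 2) + j);
    M.push_back(R * (N / 2) + j + N);
  }
  return M;
}

static std::vector<int> vuzpMask(unsigned N, unsigned R) {
  std::vector<int> M; // unzip: even (R=0) or odd (R=1) lanes of both inputs
  for (unsigned j = R; j < 2 * N; j += 2)
    M.push_back(j);
  return M;
}

int main() {
  // For N = 4 this prints: 0 4 2 6 | 0 4 1 5 | 0 2 4 6
  for (int lane : vtrnMask(4, 0)) printf("%d ", lane);
  printf("| ");
  for (int lane : vzipMask(4, 0)) printf("%d ", lane);
  printf("| ");
  for (int lane : vuzpMask(4, 0)) printf("%d ", lane);
  printf("\n");
  return 0;
}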