diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index d386c917a256d..ef5ba4fdfd114 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -7406,7 +7406,7 @@ static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  if (M.size() != NumElts && M.size() != NumElts*2)
+  if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
     return false;
 
   // If the mask is twice as long as the input vector then we need to check the
@@ -7438,7 +7438,7 @@ static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  if (M.size() != NumElts && M.size() != NumElts*2)
+  if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
     return false;
 
   for (unsigned i = 0; i < M.size(); i += NumElts) {
@@ -7541,7 +7541,7 @@ static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  if (M.size() != NumElts && M.size() != NumElts*2)
+  if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
     return false;
 
   for (unsigned i = 0; i < M.size(); i += NumElts) {
@@ -7574,7 +7574,7 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  if (M.size() != NumElts && M.size() != NumElts*2)
+  if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
     return false;
 
   for (unsigned i = 0; i < M.size(); i += NumElts) {
diff --git a/llvm/test/CodeGen/ARM/vtrn.ll b/llvm/test/CodeGen/ARM/vtrn.ll
index 136fec3ac3167..63774694f8a9a 100644
--- a/llvm/test/CodeGen/ARM/vtrn.ll
+++ b/llvm/test/CodeGen/ARM/vtrn.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
 
 define <8 x i8> @vtrni8(ptr %A, ptr %B) nounwind {
@@ -20,11 +21,11 @@ define <8 x i8> @vtrni8(ptr %A, ptr %B) nounwind {
 define <16 x i8> @vtrni8_Qres(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: vtrni8_Qres:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
-; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
-; CHECK-NEXT:    vtrn.8 [[LDR0]], [[LDR1]]
-; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
-; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vtrn.8 d17, d16
+; CHECK-NEXT:    vmov r0, r1, d17
+; CHECK-NEXT:    vmov r2, r3, d16
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
@@ -52,11 +53,11 @@ define <4 x i16> @vtrni16(ptr %A, ptr %B) nounwind {
 define <8 x i16> @vtrni16_Qres(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: vtrni16_Qres:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
-; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
-; CHECK-NEXT:    vtrn.16 [[LDR0]], [[LDR1]]
-; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
-; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vtrn.16 d17, d16
+; CHECK-NEXT:    vmov r0, r1, d17
+; CHECK-NEXT:    vmov r2, r3, d16
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -84,11 +85,11 @@ define <2 x i32> @vtrni32(ptr %A, ptr %B) nounwind {
 define <4 x i32> @vtrni32_Qres(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: vtrni32_Qres:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
-; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
-; CHECK-NEXT:    vtrn.32 [[LDR0]], [[LDR1]]
-; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
-; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vtrn.32 d17, d16
+; CHECK-NEXT:    vmov r0, r1, d17
+; CHECK-NEXT:    vmov r2, r3, d16
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -116,11 +117,11 @@ define <2 x float> @vtrnf(ptr %A, ptr %B) nounwind {
 define <4 x float> @vtrnf_Qres(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: vtrnf_Qres:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
-; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
-; CHECK-NEXT:    vtrn.32 [[LDR0]], [[LDR1]]
-; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
-; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vtrn.32 d17, d16
+; CHECK-NEXT:    vmov r0, r1, d17
+; CHECK-NEXT:    vmov r2, r3, d16
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <2 x float>, ptr %A
   %tmp2 = load <2 x float>, ptr %B
@@ -281,11 +282,11 @@ define <8 x i8> @vtrni8_undef(ptr %A, ptr %B) nounwind {
 define <16 x i8> @vtrni8_undef_Qres(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: vtrni8_undef_Qres:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
-; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
-; CHECK-NEXT:    vtrn.8 [[LDR0]], [[LDR1]]
-; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
-; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vtrn.8 d17, d16
+; CHECK-NEXT:    vmov r0, r1, d17
+; CHECK-NEXT:    vmov r2, r3, d16
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
@@ -327,9 +328,15 @@ define <16 x i16> @vtrnQi16_undef_QQres(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i16> @vtrn_lower_shufflemask_undef(ptr %A, ptr %B) {
+; CHECK-LABEL: vtrn_lower_shufflemask_undef:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vtrn.16 d17, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d16
+; CHECK-NEXT:    mov pc, lr
 entry:
-  ; CHECK-LABEL: vtrn_lower_shufflemask_undef
-  ; CHECK: vtrn
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
   %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32>
@@ -340,12 +347,26 @@ entry:
 ; values do modify the type. However, we get different input types, as some of
 ; them get truncated from i32 to i8 (from comparing cmp0 with cmp1) and some of
 ; them get truncated from i16 to i8 (from comparing cmp2 with cmp3).
-define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1,
-                                             <4 x i32> %cmp0, <4 x i32> %cmp1,
-                                             <4 x i16> %cmp2, <4 x i16> %cmp3) {
-  ; CHECK-LABEL: vtrn_mismatched_builvector0:
-  ; CHECK: vmovn.i32
-  ; CHECK: vbsl
+define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1, <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i16> %cmp2, <4 x i16> %cmp3) {
+; CHECK-LABEL: vtrn_mismatched_builvector0:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    mov r12, sp
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT:    add r12, sp, #16
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT:    vcgt.u32 q8, q9, q8
+; CHECK-NEXT:    vldr d20, [sp, #32]
+; CHECK-NEXT:    vldr d18, [sp, #40]
+; CHECK-NEXT:    vcgt.u16 d18, d18, d20
+; CHECK-NEXT:    vmovn.i32 d16, q8
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vtrn.8 d16, d18
+; CHECK-NEXT:    vmov d18, r0, r1
+; CHECK-NEXT:    vshl.i8 d16, d16, #7
+; CHECK-NEXT:    vshr.s8 d16, d16, #7
+; CHECK-NEXT:    vbsl d16, d18, d17
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
   %c0 = icmp ult <4 x i32> %cmp0, %cmp1
   %c1 = icmp ult <4 x i16> %cmp2, %cmp3
   %c = shufflevector <4 x i1> %c0, <4 x i1> %c1, <8 x i32>
@@ -356,12 +377,30 @@ define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1,
 ; Here we get a build_vector node, where half the incoming extract_element
 ; values do not modify the type (the values form cmp2), but half of them do
 ; (from the icmp operation).
-define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1,
-                                             <4 x i32> %cmp0, <4 x i32> %cmp1, ptr %cmp2_ptr) {
-  ; CHECK-LABEL: vtrn_mismatched_builvector1:
-  ; We need to extend the 4 x i8 to 4 x i16 in order to perform the vtrn
-  ; CHECK: vmovl
-  ; CHECK: vbsl
+; We need to extend the 4 x i8 to 4 x i16 in order to perform the vtrn
+define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1, <4 x i32> %cmp0, <4 x i32> %cmp1, ptr %cmp2_ptr) {
+; CHECK-LABEL: vtrn_mismatched_builvector1:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    add r12, sp, #8
+; CHECK-NEXT:    add lr, sp, #24
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT:    ldr r12, [sp, #40]
+; CHECK-NEXT:    vld1.64 {d18, d19}, [lr]
+; CHECK-NEXT:    vcgt.u32 q8, q9, q8
+; CHECK-NEXT:    vld1.32 {d18[0]}, [r12:32]
+; CHECK-NEXT:    vmovl.u8 q9, d18
+; CHECK-NEXT:    vmovn.i32 d16, q8
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vtrn.8 d16, d18
+; CHECK-NEXT:    vmov d18, r0, r1
+; CHECK-NEXT:    vshl.i8 d16, d16, #7
+; CHECK-NEXT:    vshr.s8 d16, d16, #7
+; CHECK-NEXT:    vbsl d16, d18, d17
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    pop {r11, lr}
+; CHECK-NEXT:    mov pc, lr
   %cmp2_load = load <4 x i8>, ptr %cmp2_ptr, align 4
   %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
   %c0 = icmp ult <4 x i32> %cmp0, %cmp1
@@ -373,15 +412,15 @@ define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1,
 ; The shuffle mask is half a vtrn; we duplicate the half to produce the
 ; full result.
 define void @lower_twice_no_vtrn(ptr %A, ptr %B, ptr %C) {
+; CHECK-LABEL: lower_twice_no_vtrn:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d18, [r0]
+; CHECK-NEXT:    vtrn.16 d18, d16
+; CHECK-NEXT:    vorr d17, d16, d16
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r2]
+; CHECK-NEXT:    mov pc, lr
 entry:
-  ; CHECK-LABEL: lower_twice_no_vtrn:
-  ; CHECK: @ %bb.0:
-  ; CHECK-NEXT: vldr d16, [r1]
-  ; CHECK-NEXT: vldr d18, [r0]
-  ; CHECK-NEXT: vtrn.16 d18, d16
-  ; CHECK-NEXT: vorr d17, d16, d16
-  ; CHECK-NEXT: vst1.64 {d16, d17}, [r2]
-  ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
   %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32>
@@ -392,18 +431,49 @@ entry:
 ; The shuffle mask is half a vtrn; we duplicate the half to produce the
 ; full result.
 define void @upper_twice_no_vtrn(ptr %A, ptr %B, ptr %C) {
+; CHECK-LABEL: upper_twice_no_vtrn:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d18, [r0]
+; CHECK-NEXT:    vtrn.16 d18, d16
+; CHECK-NEXT:    vorr d19, d18, d18
+; CHECK-NEXT:    vst1.64 {d18, d19}, [r2]
+; CHECK-NEXT:    mov pc, lr
 entry:
-  ; CHECK-LABEL: upper_twice_no_vtrn:
-  ; CHECK: @ %bb.0:
-  ; CHECK-NEXT: vldr d16, [r1]
-  ; CHECK-NEXT: vldr d18, [r0]
-  ; CHECK-NEXT: vtrn.16 d18, d16
-  ; CHECK-NEXT: vorr d19, d18, d18
-  ; CHECK-NEXT: vst1.64 {d18, d19}, [r2]
-  ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
   %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32>
   store <8 x i16> %0, ptr %C
   ret void
 }
+
+define void @test_15xi16(ptr %next.gep, ptr %next.gep13) {
+; CHECK-LABEL: test_15xi16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    add r2, r0, #2
+; CHECK-NEXT:    add r3, r0, #6
+; CHECK-NEXT:    vld1.16 {d16, d17}, [r2]!
+; CHECK-NEXT:    vld1.16 {d18}, [r2]!
+; CHECK-NEXT:    vld1.16 {d20, d21}, [r3]!
+; CHECK-NEXT:    ldr r2, [r2]
+; CHECK-NEXT:    vld1.16 {d22}, [r3]!
+; CHECK-NEXT:    vmov.16 d19[0], r2
+; CHECK-NEXT:    ldr r3, [r3]
+; CHECK-NEXT:    add r2, r0, #30
+; CHECK-NEXT:    add r0, r0, #34
+; CHECK-NEXT:    vmov.16 d19[1], r3
+; CHECK-NEXT:    vld1.16 {d19[2]}, [r2:16]
+; CHECK-NEXT:    vtrn.16 q8, q10
+; CHECK-NEXT:    vld1.16 {d19[3]}, [r0:16]
+; CHECK-NEXT:    vtrn.16 d18, d22
+; CHECK-NEXT:    vst1.16 {d16, d17}, [r1]!
+; CHECK-NEXT:    vst1.16 {d18, d19}, [r1]
+; CHECK-NEXT:    mov pc, lr
+  %a = getelementptr inbounds nuw i8, ptr %next.gep, i32 2
+  %b = load <15 x i16>, ptr %a, align 2
+  %c = getelementptr inbounds nuw i8, ptr %next.gep, i32 6
+  %d = load <15 x i16>, ptr %c, align 2
+  %interleaved.vec = shufflevector <15 x i16> %b, <15 x i16> %d, <16 x i32>
+  store <16 x i16> %interleaved.vec, ptr %next.gep13, align 2
+  ret void
+}
diff --git a/llvm/test/CodeGen/ARM/vuzp.ll b/llvm/test/CodeGen/ARM/vuzp.ll
index 7e1dfba34db2e..d24dadc7fc401 100644
--- a/llvm/test/CodeGen/ARM/vuzp.ll
+++ b/llvm/test/CodeGen/ARM/vuzp.ll
@@ -535,3 +535,59 @@ define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
   %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
   ret %struct.uint8x8x2_t %.fca.0.1.insert
 }
+
+define void @test_15xi16(ptr %next.gep, ptr %next.gep13) {
+; CHECK-LABEL: test_15xi16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    add r2, r0, #2
+; CHECK-NEXT:    add r3, r0, #6
+; CHECK-NEXT:    vld1.16 {d20, d21}, [r2]!
+; CHECK-NEXT:    vld1.16 {d16}, [r2]!
+; CHECK-NEXT:    vmov.u16 r12, d16[0]
+; CHECK-NEXT:    ldr r2, [r2]
+; CHECK-NEXT:    vmov.u16 r4, d20[0]
+; CHECK-NEXT:    vld1.16 {d22, d23}, [r3]!
+; CHECK-NEXT:    vld1.16 {d24}, [r3]!
+; CHECK-NEXT:    vmov.u16 lr, d16[2]
+; CHECK-NEXT:    vmov.u16 r5, d22[0]
+; CHECK-NEXT:    vmov.u16 r6, d21[0]
+; CHECK-NEXT:    vmov.16 d17[0], r12
+; CHECK-NEXT:    vmov.16 d16[0], r4
+; CHECK-NEXT:    vmov.u16 r4, d24[0]
+; CHECK-NEXT:    vmov.u16 r12, d24[2]
+; CHECK-NEXT:    vmov.16 d17[1], lr
+; CHECK-NEXT:    vmov.16 d18[0], r5
+; CHECK-NEXT:    vmov.u16 r5, d20[2]
+; CHECK-NEXT:    vmov.u16 lr, d23[0]
+; CHECK-NEXT:    vmov.16 d19[0], r4
+; CHECK-NEXT:    vmov.u16 r4, d22[2]
+; CHECK-NEXT:    vmov.16 d16[1], r5
+; CHECK-NEXT:    vmov.u16 r5, d21[2]
+; CHECK-NEXT:    vmov.16 d17[2], r2
+; CHECK-NEXT:    ldr r2, [r3]
+; CHECK-NEXT:    vmov.16 d16[2], r6
+; CHECK-NEXT:    vmov.16 d18[1], r4
+; CHECK-NEXT:    vmov.u16 r4, d23[2]
+; CHECK-NEXT:    vmov.16 d19[1], r12
+; CHECK-NEXT:    vmov.16 d18[2], lr
+; CHECK-NEXT:    vmov.16 d19[2], r2
+; CHECK-NEXT:    add r2, r0, #30
+; CHECK-NEXT:    add r0, r0, #34
+; CHECK-NEXT:    vld1.16 {d17[3]}, [r2:16]
+; CHECK-NEXT:    vmov.16 d16[3], r5
+; CHECK-NEXT:    vmov.16 d18[3], r4
+; CHECK-NEXT:    vld1.16 {d19[3]}, [r0:16]
+; CHECK-NEXT:    vst1.16 {d16, d17}, [r1]!
+; CHECK-NEXT:    vst1.16 {d18, d19}, [r1]
+; CHECK-NEXT:    pop {r4, r5, r6, lr}
+; CHECK-NEXT:    mov pc, lr
+  %a = getelementptr inbounds nuw i8, ptr %next.gep, i32 2
+  %b = load <15 x i16>, ptr %a, align 2
+  %c = getelementptr inbounds nuw i8, ptr %next.gep, i32 6
+  %d = load <15 x i16>, ptr %c, align 2
+  %interleaved.vec = shufflevector <15 x i16> %b, <15 x i16> %d, <16 x i32>
+  store <16 x i16> %interleaved.vec, ptr %next.gep13, align 2
+  ret void
+}
diff --git a/llvm/test/CodeGen/ARM/vzip.ll b/llvm/test/CodeGen/ARM/vzip.ll
index dda774abd8516..ce40a2e48b6e8 100644
--- a/llvm/test/CodeGen/ARM/vzip.ll
+++ b/llvm/test/CodeGen/ARM/vzip.ll
@@ -381,3 +381,22 @@ entry:
   %vzip.i = shufflevector <8 x i8> %lane, <8 x i8> %lane3, <8 x i32>
   ret <8 x i8> %vzip.i
 }
+
+define <16 x i16> @test_15xi16(ptr %next.gep, ptr %next.gep13) {
+; CHECK-LABEL: test_15xi16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    add r1, r1, #2
+; CHECK-NEXT:    mov r2, #4
+; CHECK-NEXT:    vld1.16 {d16, d17}, [r1], r2
+; CHECK-NEXT:    vld1.16 {d18, d19}, [r1]
+; CHECK-NEXT:    vzip.16 q8, q9
+; CHECK-NEXT:    vst1.16 {d16, d17}, [r0:128]!
+; CHECK-NEXT:    vst1.64 {d18, d19}, [r0:128]
+; CHECK-NEXT:    mov pc, lr
+  %a = getelementptr inbounds nuw i8, ptr %next.gep, i32 2
+  %b = load <15 x i16>, ptr %a, align 2
+  %c = getelementptr inbounds nuw i8, ptr %next.gep, i32 6
+  %d = load <15 x i16>, ptr %c, align 2
+  %interleaved.vec = shufflevector <15 x i16> %b, <15 x i16> %d, <16 x i32>
+  ret <16 x i16> %interleaved.vec
+}
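
Reviewer note (not part of the patch): isVTRNMask, isVTRN_v_undef_Mask, isVZIPMask, and isVZIP_v_undef_Mask all walk the shuffle mask two lanes at a time, pairing lane j with lane j + 1 within each result half. With an odd element count, such as the <15 x i16> loads in the new tests, the last pair would index one slot past the end of a single-result mask, so the added "|| NumElts % 2 != 0" clause rejects those masks before the walk begins. Below is a minimal standalone C++ sketch of that shape of predicate, with hypothetical names and simplified undef handling; it is not the LLVM source:

#include <cstdio>
#include <vector>

// Sketch of a VTRN-style mask predicate. VTRN's first result is
// <a0, b0, a2, b2, ...> and its second is <a1, b1, a3, b3, ...>, so the
// mask is validated in pairs {j, j + 1}. Without the NumElts % 2 guard,
// an odd NumElts with M.size() == NumElts would read M[Half + j + 1]
// == M[NumElts], one element past the end of the mask.
static bool isVTRNMaskSketch(const std::vector<int> &M, unsigned NumElts,
                             unsigned &WhichResult) {
  if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
    return false;
  for (unsigned Half = 0; Half < M.size(); Half += NumElts) {
    WhichResult = (M[Half] == 0) ? 0 : 1; // the real code also tolerates undef here
    for (unsigned j = 0; j < NumElts; j += 2) {
      int A = M[Half + j], B = M[Half + j + 1];
      if ((A >= 0 && unsigned(A) != j + WhichResult) ||
          (B >= 0 && unsigned(B) != j + NumElts + WhichResult))
        return false; // -1 stands for an undef lane, which matches anything
    }
  }
  return true;
}

int main() {
  unsigned Which;
  std::vector<int> Even = {0, 4, 2, 6};                // vtrn result 0 of two 4-lane vectors
  std::vector<int> Odd  = {0, 15, 2, 17, 4, 19, 6, 21, // 15-lane trn-like mask,
                           8, 23, 10, 25, 12, 27, 14}; // rejected by the new guard
  std::printf("even: %d\n", isVTRNMaskSketch(Even, 4, Which));  // prints 1
  std::printf("odd:  %d\n", isVTRNMaskSketch(Odd, 15, Which));  // prints 0
}

Running the sketch prints "even: 1" and "odd: 0": the 15-lane mask is rejected by the parity clause alone, which is the same early exit the patch adds to all four recognizers before their pairwise loops can read out of bounds.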