diff --git a/llvm/lib/CodeGen/SlotIndexes.cpp b/llvm/lib/CodeGen/SlotIndexes.cpp index 1b92a5aa59d18..eecc379776ea0 100644 --- a/llvm/lib/CodeGen/SlotIndexes.cpp +++ b/llvm/lib/CodeGen/SlotIndexes.cpp @@ -212,6 +212,7 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB, IndexList::iterator ListI = endIdx.listEntry()->getIterator(); MachineBasicBlock::iterator MBBI = End; bool pastStart = false; + bool OldIndexesRemoved = false; while (ListI != ListB || MBBI != Begin || (includeStart && !pastStart)) { assert(ListI->getIndex() >= startIdx.getIndex() && (includeStart || !pastStart) && @@ -220,6 +221,8 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB, MachineInstr *SlotMI = ListI->getInstr(); MachineInstr *MI = (MBBI != MBB->end() && !pastStart) ? &*MBBI : nullptr; bool MBBIAtBegin = MBBI == Begin && (!includeStart || pastStart); + bool MIIndexNotFound = MI && !mi2iMap.contains(MI); + bool SlotMIRemoved = false; if (SlotMI == MI && !MBBIAtBegin) { --ListI; @@ -227,25 +230,31 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB, --MBBI; else pastStart = true; - } else if (MI && !mi2iMap.contains(MI)) { + } else if (MIIndexNotFound || OldIndexesRemoved) { if (MBBI != Begin) --MBBI; else pastStart = true; } else { - --ListI; - if (SlotMI) + // We ran through all the indexes on the interval + // -> The only thing left is to go through all the + // remaining MBB instructions and update their indexes + if (ListI == ListB) + OldIndexesRemoved = true; + else + --ListI; + if (SlotMI) { removeMachineInstrFromMaps(*SlotMI); + SlotMIRemoved = true; + } } - } - // In theory this could be combined with the previous loop, but it is tricky - // to update the IndexList while we are iterating it. - for (MachineBasicBlock::iterator I = End; I != Begin;) { - --I; - MachineInstr &MI = *I; - if (!MI.isDebugOrPseudoInstr() && !mi2iMap.contains(&MI)) - insertMachineInstrInMaps(MI); + MachineInstr *InstrToInsert = SlotMIRemoved ? SlotMI : MI; + + // Insert instruction back into the maps after passing it/removing the index + if ((MIIndexNotFound || SlotMIRemoved) && InstrToInsert->getParent() && + !InstrToInsert->isDebugOrPseudoInstr()) + insertMachineInstrInMaps(*InstrToInsert); } } diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll index feda774ab0e65..82c8d50e518b0 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -223,18 +223,31 @@ entry: } define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) { -; CHECK-LABEL: shuffle3_i16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s5, s3 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vins.f16 s1, s0 -; CHECK-NEXT: vins.f16 s6, s4 -; CHECK-NEXT: vins.f16 s5, s3 -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s7, s1 -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: shuffle3_i16: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: vmovx.f16 s5, s3 +; CHECK-LV-NEXT: vmovx.f16 s6, s1 +; CHECK-LV-NEXT: vmovx.f16 s4, s0 +; CHECK-LV-NEXT: vins.f16 s1, s0 +; CHECK-LV-NEXT: vins.f16 s6, s4 +; CHECK-LV-NEXT: vins.f16 s5, s3 +; CHECK-LV-NEXT: vmov.f32 s4, s2 +; CHECK-LV-NEXT: vmov.f32 s7, s1 +; CHECK-LV-NEXT: vmov q0, q1 +; CHECK-LV-NEXT: bx lr + +; CHECK-LIS-LABEL: shuffle3_i16: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: vmov q1, q0 +; CHECK-LIS-NEXT: vmovx.f16 s2, s5 +; CHECK-LIS-NEXT: vmovx.f16 s0, s4 +; CHECK-LIS-NEXT: vins.f16 s5, s4 +; CHECK-LIS-NEXT: vins.f16 s2, s0 +; CHECK-LIS-NEXT: vmov.f32 s0, s6 +; CHECK-LIS-NEXT: vmovx.f16 s1, s7 +; CHECK-LIS-NEXT: vmov.f32 s3, s5 +; CHECK-LIS-NEXT: vins.f16 s1, s7 +; CHECK-LIS-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> ret <8 x i16> %out @@ -1145,18 +1158,31 @@ entry: } define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) { -; CHECK-LABEL: shuffle3_f16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s5, s3 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vins.f16 s1, s0 -; CHECK-NEXT: vins.f16 s6, s4 -; CHECK-NEXT: vins.f16 s5, s3 -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s7, s1 -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: shuffle3_f16: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: vmovx.f16 s5, s3 +; CHECK-LV-NEXT: vmovx.f16 s6, s1 +; CHECK-LV-NEXT: vmovx.f16 s4, s0 +; CHECK-LV-NEXT: vins.f16 s1, s0 +; CHECK-LV-NEXT: vins.f16 s6, s4 +; CHECK-LV-NEXT: vins.f16 s5, s3 +; CHECK-LV-NEXT: vmov.f32 s4, s2 +; CHECK-LV-NEXT: vmov.f32 s7, s1 +; CHECK-LV-NEXT: vmov q0, q1 +; CHECK-LV-NEXT: bx lr + +; CHECK-LIS-LABEL: shuffle3_f16: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: vmov q1, q0 +; CHECK-LIS-NEXT: vmovx.f16 s2, s5 +; CHECK-LIS-NEXT: vmovx.f16 s0, s4 +; CHECK-LIS-NEXT: vins.f16 s5, s4 +; CHECK-LIS-NEXT: vins.f16 s2, s0 +; CHECK-LIS-NEXT: vmov.f32 s0, s6 +; CHECK-LIS-NEXT: vmovx.f16 s1, s7 +; CHECK-LIS-NEXT: vmov.f32 s3, s5 +; CHECK-LIS-NEXT: vins.f16 s1, s7 +; CHECK-LIS-NEXT: bx lr entry: %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> ret <8 x half> %out @@ -1467,27 +1493,47 @@ entry: ret <2 x double> %out } define arm_aapcs_vfpcc <8 x double> @shuffle9_f64(<4 x double> %src1, <4 x double> %src2) { -; CHECK-LABEL: shuffle9_f64: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov q5, q2 -; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: vmov.f32 s18, s20 -; CHECK-NEXT: vmov.f32 s20, s2 -; CHECK-NEXT: vmov.f32 s10, s12 -; CHECK-NEXT: vmov.f32 s19, s21 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s17, s1 -; CHECK-NEXT: vmov.f32 s21, s3 -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s11, s13 -; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: shuffle9_f64: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LV-NEXT: vmov q5, q2 +; CHECK-LV-NEXT: vmov.f32 s16, s0 +; CHECK-LV-NEXT: vmov.f32 s18, s20 +; CHECK-LV-NEXT: vmov.f32 s20, s2 +; CHECK-LV-NEXT: vmov.f32 s10, s12 +; CHECK-LV-NEXT: vmov.f32 s19, s21 +; CHECK-LV-NEXT: vmov.f32 s8, s4 +; CHECK-LV-NEXT: vmov.f32 s17, s1 +; CHECK-LV-NEXT: vmov.f32 s21, s3 +; CHECK-LV-NEXT: vmov q0, q4 +; CHECK-LV-NEXT: vmov.f32 s12, s6 +; CHECK-LV-NEXT: vmov.f32 s11, s13 +; CHECK-LV-NEXT: vmov.f32 s9, s5 +; CHECK-LV-NEXT: vmov.f32 s13, s7 +; CHECK-LV-NEXT: vmov q1, q5 +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LV-NEXT: bx lr + +; CHECK-LIS-LABEL: shuffle9_f64: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vmov q5, q2 +; CHECK-LIS-NEXT: vmov q4, q0 +; CHECK-LIS-NEXT: vmov.f32 s2, s20 +; CHECK-LIS-NEXT: vmov.f32 s20, s18 +; CHECK-LIS-NEXT: vmov.f32 s10, s12 +; CHECK-LIS-NEXT: vmov.f32 s3, s21 +; CHECK-LIS-NEXT: vmov.f32 s8, s4 +; CHECK-LIS-NEXT: vmov.f32 s21, s19 +; CHECK-LIS-NEXT: vmov.f32 s12, s6 +; CHECK-LIS-NEXT: vmov.f32 s11, s13 +; CHECK-LIS-NEXT: vmov.f32 s9, s5 +; CHECK-LIS-NEXT: vmov.f32 s13, s7 +; CHECK-LIS-NEXT: vmov q1, q5 +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LIS-NEXT: bx lr entry: %out = shufflevector <4 x double> %src1, <4 x double> %src2, <8 x i32> ret <8 x double> %out @@ -1560,27 +1606,47 @@ entry: ret <2 x i64> %out } define arm_aapcs_vfpcc <8 x i64> @shuffle9_i64(<4 x i64> %src1, <4 x i64> %src2) { -; CHECK-LABEL: shuffle9_i64: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov q5, q2 -; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: vmov.f32 s18, s20 -; CHECK-NEXT: vmov.f32 s20, s2 -; CHECK-NEXT: vmov.f32 s10, s12 -; CHECK-NEXT: vmov.f32 s19, s21 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s17, s1 -; CHECK-NEXT: vmov.f32 s21, s3 -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s11, s13 -; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: shuffle9_i64: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LV-NEXT: vmov q5, q2 +; CHECK-LV-NEXT: vmov.f32 s16, s0 +; CHECK-LV-NEXT: vmov.f32 s18, s20 +; CHECK-LV-NEXT: vmov.f32 s20, s2 +; CHECK-LV-NEXT: vmov.f32 s10, s12 +; CHECK-LV-NEXT: vmov.f32 s19, s21 +; CHECK-LV-NEXT: vmov.f32 s8, s4 +; CHECK-LV-NEXT: vmov.f32 s17, s1 +; CHECK-LV-NEXT: vmov.f32 s21, s3 +; CHECK-LV-NEXT: vmov q0, q4 +; CHECK-LV-NEXT: vmov.f32 s12, s6 +; CHECK-LV-NEXT: vmov.f32 s11, s13 +; CHECK-LV-NEXT: vmov.f32 s9, s5 +; CHECK-LV-NEXT: vmov.f32 s13, s7 +; CHECK-LV-NEXT: vmov q1, q5 +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LV-NEXT: bx lr + +; CHECK-LIS-LABEL: shuffle9_i64: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vmov q5, q2 +; CHECK-LIS-NEXT: vmov q4, q0 +; CHECK-LIS-NEXT: vmov.f32 s2, s20 +; CHECK-LIS-NEXT: vmov.f32 s20, s18 +; CHECK-LIS-NEXT: vmov.f32 s10, s12 +; CHECK-LIS-NEXT: vmov.f32 s3, s21 +; CHECK-LIS-NEXT: vmov.f32 s8, s4 +; CHECK-LIS-NEXT: vmov.f32 s21, s19 +; CHECK-LIS-NEXT: vmov.f32 s12, s6 +; CHECK-LIS-NEXT: vmov.f32 s11, s13 +; CHECK-LIS-NEXT: vmov.f32 s9, s5 +; CHECK-LIS-NEXT: vmov.f32 s13, s7 +; CHECK-LIS-NEXT: vmov q1, q5 +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LIS-NEXT: bx lr entry: %out = shufflevector <4 x i64> %src1, <4 x i64> %src2, <8 x i32> ret <8 x i64> %out diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll index 0c58abaa1c86e..b6c8056891f83 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -33,29 +33,53 @@ entry: } define void @vld3_v4i32(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v4i32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s15, s18 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vadd.i32 q0, q2, q0 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: vld3_v4i32: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9} +; CHECK-LV-NEXT: vpush {d8, d9} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-LV-NEXT: vldrw.u32 q1, [r0] +; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-LV-NEXT: vmov.f32 s10, s2 +; CHECK-LV-NEXT: vmov.f32 s13, s0 +; CHECK-LV-NEXT: vmov.f32 s14, s3 +; CHECK-LV-NEXT: vmov.f32 s8, s4 +; CHECK-LV-NEXT: vmov.f32 s9, s7 +; CHECK-LV-NEXT: vmov.f32 s12, s5 +; CHECK-LV-NEXT: vmov.f32 s15, s18 +; CHECK-LV-NEXT: vmov.f32 s11, s17 +; CHECK-LV-NEXT: vadd.i32 q2, q2, q3 +; CHECK-LV-NEXT: vmov.f32 s0, s6 +; CHECK-LV-NEXT: vmov.f32 s2, s16 +; CHECK-LV-NEXT: vmov.f32 s3, s19 +; CHECK-LV-NEXT: vadd.i32 q0, q2, q0 +; CHECK-LV-NEXT: vstrw.32 q0, [r1] +; CHECK-LV-NEXT: vpop {d8, d9} +; CHECK-LV-NEXT: bx lr + +; CHECK-LIS-LABEL: vld3_v4i32: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9} +; CHECK-LIS-NEXT: vpush {d8, d9} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0] +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LIS-NEXT: vmov.f32 s10, s2 +; CHECK-LIS-NEXT: vmov.f32 s17, s0 +; CHECK-LIS-NEXT: vmov.f32 s18, s3 +; CHECK-LIS-NEXT: vmov.f32 s8, s4 +; CHECK-LIS-NEXT: vmov.f32 s9, s7 +; CHECK-LIS-NEXT: vmov.f32 s16, s5 +; CHECK-LIS-NEXT: vmov.f32 s19, s14 +; CHECK-LIS-NEXT: vmov.f32 s11, s13 +; CHECK-LIS-NEXT: vadd.i32 q2, q2, q4 +; CHECK-LIS-NEXT: vmov.f32 s0, s6 +; CHECK-LIS-NEXT: vmov.f32 s2, s12 +; CHECK-LIS-NEXT: vmov.f32 s3, s15 +; CHECK-LIS-NEXT: vadd.i32 q0, q2, q0 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1] +; CHECK-LIS-NEXT: vpop {d8, d9} +; CHECK-LIS-NEXT: bx lr entry: %l1 = load <12 x i32>, ptr %src, align 4 %s1 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> @@ -68,46 +92,87 @@ entry: } define void @vld3_v8i32(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v8i32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s15, s18 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vadd.i32 q0, q2, q0 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vmov.f32 s20, s8 -; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vmov.f32 s7, s15 -; CHECK-NEXT: vadd.i32 q1, q4, q1 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: vld3_v8i32: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-LV-NEXT: vmov.f32 s10, s2 +; CHECK-LV-NEXT: vmov.f32 s13, s0 +; CHECK-LV-NEXT: vmov.f32 s14, s3 +; CHECK-LV-NEXT: vmov.f32 s8, s4 +; CHECK-LV-NEXT: vmov.f32 s9, s7 +; CHECK-LV-NEXT: vmov.f32 s12, s5 +; CHECK-LV-NEXT: vmov.f32 s15, s18 +; CHECK-LV-NEXT: vmov.f32 s11, s17 +; CHECK-LV-NEXT: vadd.i32 q2, q2, q3 +; CHECK-LV-NEXT: vmov.f32 s0, s6 +; CHECK-LV-NEXT: vmov.f32 s2, s16 +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LV-NEXT: vmov.f32 s3, s19 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LV-NEXT: vadd.i32 q0, q2, q0 +; CHECK-LV-NEXT: vldrw.u32 q2, [r0] +; CHECK-LV-NEXT: vmov.f32 s17, s4 +; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LV-NEXT: vmov.f32 s18, s7 +; CHECK-LV-NEXT: vmov.f32 s22, s6 +; CHECK-LV-NEXT: vmov.f32 s16, s9 +; CHECK-LV-NEXT: vmov.f32 s19, s14 +; CHECK-LV-NEXT: vmov.f32 s20, s8 +; CHECK-LV-NEXT: vmov.f32 s21, s11 +; CHECK-LV-NEXT: vmov.f32 s23, s13 +; CHECK-LV-NEXT: vadd.i32 q4, q5, q4 +; CHECK-LV-NEXT: vmov.f32 s4, s10 +; CHECK-LV-NEXT: vmov.f32 s6, s12 +; CHECK-LV-NEXT: vmov.f32 s7, s15 +; CHECK-LV-NEXT: vadd.i32 q1, q4, q1 +; CHECK-LV-NEXT: vstrw.32 q1, [r1] +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LV-NEXT: bx lr + +; CHECK-LIS-LABEL: vld3_v8i32: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-LIS-NEXT: vmov.f32 s10, s2 +; CHECK-LIS-NEXT: vmov.f32 s17, s0 +; CHECK-LIS-NEXT: vmov.f32 s18, s3 +; CHECK-LIS-NEXT: vmov.f32 s8, s4 +; CHECK-LIS-NEXT: vmov.f32 s9, s7 +; CHECK-LIS-NEXT: vmov.f32 s16, s5 +; CHECK-LIS-NEXT: vmov.f32 s19, s14 +; CHECK-LIS-NEXT: vmov.f32 s11, s13 +; CHECK-LIS-NEXT: vmov.f32 s0, s6 +; CHECK-LIS-NEXT: vadd.i32 q2, q2, q4 +; CHECK-LIS-NEXT: vmov.f32 s2, s12 +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LIS-NEXT: vmov.f32 s3, s15 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LIS-NEXT: vadd.i32 q0, q2, q0 +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] +; CHECK-LIS-NEXT: vmov.f32 s17, s4 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LIS-NEXT: vmov.f32 s18, s7 +; CHECK-LIS-NEXT: vmov.f32 s22, s6 +; CHECK-LIS-NEXT: vmov.f32 s16, s9 +; CHECK-LIS-NEXT: vmov.f32 s19, s14 +; CHECK-LIS-NEXT: vmov.f32 s20, s8 +; CHECK-LIS-NEXT: vmov.f32 s21, s11 +; CHECK-LIS-NEXT: vmov.f32 s23, s13 +; CHECK-LIS-NEXT: vadd.i32 q4, q5, q4 +; CHECK-LIS-NEXT: vmov.f32 s4, s10 +; CHECK-LIS-NEXT: vmov.f32 s6, s12 +; CHECK-LIS-NEXT: vmov.f32 s7, s15 +; CHECK-LIS-NEXT: vadd.i32 q1, q4, q1 +; CHECK-LIS-NEXT: vstrw.32 q1, [r1] +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LIS-NEXT: bx lr entry: %l1 = load <24 x i32>, ptr %src, align 4 %s1 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> @@ -120,80 +185,155 @@ entry: } define void @vld3_v16i32(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v16i32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s15, s18 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vadd.i32 q0, q2, q0 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vmov.f32 s20, s8 -; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s7, s15 -; CHECK-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-NEXT: vadd.i32 q1, q4, q1 -; CHECK-NEXT: vmov.f32 s18, s10 -; CHECK-NEXT: vmov.f32 s21, s8 -; CHECK-NEXT: vmov.f32 s22, s11 -; CHECK-NEXT: vmov.f32 s16, s12 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vmov.f32 s23, s26 -; CHECK-NEXT: vmov.f32 s19, s25 -; CHECK-NEXT: vadd.i32 q4, q4, q5 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s10, s24 -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vmov.f32 s11, s27 -; CHECK-NEXT: vldrw.u32 q5, [r0, #128] -; CHECK-NEXT: vadd.i32 q2, q4, q2 -; CHECK-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-NEXT: vmov.f32 s25, s12 -; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vmov.f32 s26, s15 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s30, s14 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vmov.f32 s24, s17 -; CHECK-NEXT: vmov.f32 s27, s22 -; CHECK-NEXT: vmov.f32 s28, s16 -; CHECK-NEXT: vmov.f32 s29, s19 -; CHECK-NEXT: vmov.f32 s31, s21 -; CHECK-NEXT: vadd.i32 q6, q7, q6 -; CHECK-NEXT: vmov.f32 s12, s18 -; CHECK-NEXT: vmov.f32 s14, s20 -; CHECK-NEXT: vmov.f32 s15, s23 -; CHECK-NEXT: vadd.i32 q3, q6, q3 -; CHECK-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: vld3_v16i32: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-LV-NEXT: vldrw.u32 q6, [r0, #176] +; CHECK-LV-NEXT: vmov.f32 s10, s2 +; CHECK-LV-NEXT: vmov.f32 s13, s0 +; CHECK-LV-NEXT: vmov.f32 s14, s3 +; CHECK-LV-NEXT: vmov.f32 s8, s4 +; CHECK-LV-NEXT: vmov.f32 s9, s7 +; CHECK-LV-NEXT: vmov.f32 s12, s5 +; CHECK-LV-NEXT: vmov.f32 s15, s18 +; CHECK-LV-NEXT: vmov.f32 s11, s17 +; CHECK-LV-NEXT: vadd.i32 q2, q2, q3 +; CHECK-LV-NEXT: vmov.f32 s0, s6 +; CHECK-LV-NEXT: vmov.f32 s2, s16 +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LV-NEXT: vmov.f32 s3, s19 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LV-NEXT: vadd.i32 q0, q2, q0 +; CHECK-LV-NEXT: vldrw.u32 q2, [r0] +; CHECK-LV-NEXT: vmov.f32 s17, s4 +; CHECK-LV-NEXT: vmov.f32 s18, s7 +; CHECK-LV-NEXT: vmov.f32 s22, s6 +; CHECK-LV-NEXT: vmov.f32 s16, s9 +; CHECK-LV-NEXT: vmov.f32 s19, s14 +; CHECK-LV-NEXT: vmov.f32 s20, s8 +; CHECK-LV-NEXT: vmov.f32 s21, s11 +; CHECK-LV-NEXT: vmov.f32 s23, s13 +; CHECK-LV-NEXT: vmov.f32 s4, s10 +; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-LV-NEXT: vmov.f32 s6, s12 +; CHECK-LV-NEXT: vadd.i32 q4, q5, q4 +; CHECK-LV-NEXT: vmov.f32 s7, s15 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #144] +; CHECK-LV-NEXT: vadd.i32 q1, q4, q1 +; CHECK-LV-NEXT: vmov.f32 s18, s10 +; CHECK-LV-NEXT: vmov.f32 s21, s8 +; CHECK-LV-NEXT: vmov.f32 s22, s11 +; CHECK-LV-NEXT: vmov.f32 s16, s12 +; CHECK-LV-NEXT: vmov.f32 s17, s15 +; CHECK-LV-NEXT: vmov.f32 s20, s13 +; CHECK-LV-NEXT: vmov.f32 s23, s26 +; CHECK-LV-NEXT: vmov.f32 s19, s25 +; CHECK-LV-NEXT: vadd.i32 q4, q4, q5 +; CHECK-LV-NEXT: vmov.f32 s8, s14 +; CHECK-LV-NEXT: vmov.f32 s10, s24 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-LV-NEXT: vmov.f32 s11, s27 +; CHECK-LV-NEXT: vldrw.u32 q5, [r0, #128] +; CHECK-LV-NEXT: vadd.i32 q2, q4, q2 +; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #96] +; CHECK-LV-NEXT: vmov.f32 s25, s12 +; CHECK-LV-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-LV-NEXT: vmov.f32 s26, s15 +; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LV-NEXT: vmov.f32 s30, s14 +; CHECK-LV-NEXT: vstrw.32 q1, [r1] +; CHECK-LV-NEXT: vmov.f32 s24, s17 +; CHECK-LV-NEXT: vmov.f32 s27, s22 +; CHECK-LV-NEXT: vmov.f32 s28, s16 +; CHECK-LV-NEXT: vmov.f32 s29, s19 +; CHECK-LV-NEXT: vmov.f32 s31, s21 +; CHECK-LV-NEXT: vadd.i32 q6, q7, q6 +; CHECK-LV-NEXT: vmov.f32 s12, s18 +; CHECK-LV-NEXT: vmov.f32 s14, s20 +; CHECK-LV-NEXT: vmov.f32 s15, s23 +; CHECK-LV-NEXT: vadd.i32 q3, q6, q3 +; CHECK-LV-NEXT: vstrw.32 q3, [r1, #32] +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LV-NEXT: bx lr + +; CHECK-LIS-LABEL: vld3_v16i32: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-LIS-NEXT: vmov.f32 s10, s2 +; CHECK-LIS-NEXT: vmov.f32 s17, s0 +; CHECK-LIS-NEXT: vmov.f32 s18, s3 +; CHECK-LIS-NEXT: vmov.f32 s8, s4 +; CHECK-LIS-NEXT: vmov.f32 s9, s7 +; CHECK-LIS-NEXT: vmov.f32 s16, s5 +; CHECK-LIS-NEXT: vmov.f32 s19, s14 +; CHECK-LIS-NEXT: vmov.f32 s11, s13 +; CHECK-LIS-NEXT: vmov.f32 s0, s6 +; CHECK-LIS-NEXT: vadd.i32 q2, q2, q4 +; CHECK-LIS-NEXT: vmov.f32 s2, s12 +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LIS-NEXT: vmov.f32 s3, s15 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LIS-NEXT: vadd.i32 q0, q2, q0 +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] +; CHECK-LIS-NEXT: vmov.f32 s17, s4 +; CHECK-LIS-NEXT: vmov.f32 s18, s7 +; CHECK-LIS-NEXT: vmov.f32 s22, s6 +; CHECK-LIS-NEXT: vmov.f32 s16, s9 +; CHECK-LIS-NEXT: vmov.f32 s19, s14 +; CHECK-LIS-NEXT: vmov.f32 s20, s8 +; CHECK-LIS-NEXT: vmov.f32 s21, s11 +; CHECK-LIS-NEXT: vmov.f32 s23, s13 +; CHECK-LIS-NEXT: vadd.i32 q4, q5, q4 +; CHECK-LIS-NEXT: vmov.f32 s4, s10 +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-LIS-NEXT: vldrw.u32 q5, [r0, #176] +; CHECK-LIS-NEXT: vmov.f32 s6, s12 +; CHECK-LIS-NEXT: vmov.f32 s7, s15 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #144] +; CHECK-LIS-NEXT: vadd.i32 q1, q4, q1 +; CHECK-LIS-NEXT: vmov.f32 s18, s10 +; CHECK-LIS-NEXT: vmov.f32 s25, s8 +; CHECK-LIS-NEXT: vmov.f32 s26, s11 +; CHECK-LIS-NEXT: vmov.f32 s16, s12 +; CHECK-LIS-NEXT: vmov.f32 s17, s15 +; CHECK-LIS-NEXT: vmov.f32 s24, s13 +; CHECK-LIS-NEXT: vmov.f32 s27, s22 +; CHECK-LIS-NEXT: vmov.f32 s19, s21 +; CHECK-LIS-NEXT: vmov.f32 s8, s14 +; CHECK-LIS-NEXT: vadd.i32 q4, q4, q6 +; CHECK-LIS-NEXT: vmov.f32 s10, s20 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-LIS-NEXT: vmov.f32 s11, s23 +; CHECK-LIS-NEXT: vldrw.u32 q5, [r0, #128] +; CHECK-LIS-NEXT: vadd.i32 q2, q4, q2 +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #96] +; CHECK-LIS-NEXT: vmov.f32 s25, s12 +; CHECK-LIS-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-LIS-NEXT: vmov.f32 s26, s15 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LIS-NEXT: vmov.f32 s30, s14 +; CHECK-LIS-NEXT: vstrw.32 q1, [r1] +; CHECK-LIS-NEXT: vmov.f32 s24, s17 +; CHECK-LIS-NEXT: vmov.f32 s27, s22 +; CHECK-LIS-NEXT: vmov.f32 s28, s16 +; CHECK-LIS-NEXT: vmov.f32 s29, s19 +; CHECK-LIS-NEXT: vmov.f32 s31, s21 +; CHECK-LIS-NEXT: vadd.i32 q6, q7, q6 +; CHECK-LIS-NEXT: vmov.f32 s12, s18 +; CHECK-LIS-NEXT: vmov.f32 s14, s20 +; CHECK-LIS-NEXT: vmov.f32 s15, s23 +; CHECK-LIS-NEXT: vadd.i32 q3, q6, q3 +; CHECK-LIS-NEXT: vstrw.32 q3, [r1, #32] +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LIS-NEXT: bx lr entry: %l1 = load <48 x i32>, ptr %src, align 4 %s1 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> @@ -792,35 +932,65 @@ entry: ; i64 define void @vld3_v2i64(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v2i64: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s13, s3 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov r0, r3, d5 -; CHECK-NEXT: vmov r2, r4, d3 -; CHECK-NEXT: vmov r6, r7, d0 -; CHECK-NEXT: vmov r5, r8, d6 -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: adds.w r0, r0, lr -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r2, r3, r4 -; CHECK-NEXT: vmov r3, r4, d4 -; CHECK-NEXT: adds r6, r6, r5 -; CHECK-NEXT: adc.w r7, r7, r8 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: adcs r7, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r7, r2 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-LV-LABEL: vld3_v2i64: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-LV-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0] +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-LV-NEXT: vmov.f32 s12, s2 +; CHECK-LV-NEXT: vmov.f32 s13, s3 +; CHECK-LV-NEXT: vmov.f32 s2, s4 +; CHECK-LV-NEXT: vmov.f32 s3, s5 +; CHECK-LV-NEXT: vmov r0, r3, d5 +; CHECK-LV-NEXT: vmov r2, r4, d3 +; CHECK-LV-NEXT: vmov r6, r7, d0 +; CHECK-LV-NEXT: vmov r5, r8, d6 +; CHECK-LV-NEXT: vmov lr, r12, d1 +; CHECK-LV-NEXT: adds.w r0, r0, lr +; CHECK-LV-NEXT: adc.w r3, r3, r12 +; CHECK-LV-NEXT: adds r0, r0, r2 +; CHECK-LV-NEXT: adc.w r2, r3, r4 +; CHECK-LV-NEXT: vmov r3, r4, d4 +; CHECK-LV-NEXT: adds r6, r6, r5 +; CHECK-LV-NEXT: adc.w r7, r7, r8 +; CHECK-LV-NEXT: adds r3, r3, r6 +; CHECK-LV-NEXT: adcs r7, r4 +; CHECK-LV-NEXT: vmov q0[2], q0[0], r3, r0 +; CHECK-LV-NEXT: vmov q0[3], q0[1], r7, r2 +; CHECK-LV-NEXT: vstrw.32 q0, [r1] +; CHECK-LV-NEXT: pop.w {r4, r5, r6, r7, r8, pc} + +; CHECK-LIS-LABEL: vld3_v2i64: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-LIS-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-LIS-NEXT: vmov.f32 s8, s2 +; CHECK-LIS-NEXT: vmov.f32 s9, s3 +; CHECK-LIS-NEXT: vmov.f32 s2, s4 +; CHECK-LIS-NEXT: vmov.f32 s3, s5 +; CHECK-LIS-NEXT: vmov r0, r3, d7 +; CHECK-LIS-NEXT: vmov r2, r4, d3 +; CHECK-LIS-NEXT: vmov r6, r7, d0 +; CHECK-LIS-NEXT: vmov r5, r8, d4 +; CHECK-LIS-NEXT: vmov lr, r12, d1 +; CHECK-LIS-NEXT: adds.w r0, r0, lr +; CHECK-LIS-NEXT: adc.w r3, r3, r12 +; CHECK-LIS-NEXT: adds r0, r0, r2 +; CHECK-LIS-NEXT: adc.w r2, r3, r4 +; CHECK-LIS-NEXT: vmov r3, r4, d6 +; CHECK-LIS-NEXT: adds r6, r6, r5 +; CHECK-LIS-NEXT: adc.w r7, r7, r8 +; CHECK-LIS-NEXT: adds r3, r3, r6 +; CHECK-LIS-NEXT: adcs r7, r4 +; CHECK-LIS-NEXT: vmov q0[2], q0[0], r3, r0 +; CHECK-LIS-NEXT: vmov q0[3], q0[1], r7, r2 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1] +; CHECK-LIS-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <6 x i64>, ptr %src, align 4 %s1 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> @@ -917,28 +1087,28 @@ define void @vld3_v4i64(ptr %src, ptr %dst) { ; CHECK-LIS-NEXT: vmov lr, r12, d1 ; CHECK-LIS-NEXT: vmov.f32 s2, s12 ; CHECK-LIS-NEXT: vmov.f32 s3, s13 -; CHECK-LIS-NEXT: vmov r6, r7, d12 +; CHECK-LIS-NEXT: vmov r7, r6, d12 ; CHECK-LIS-NEXT: adds.w r0, r5, lr ; CHECK-LIS-NEXT: adc.w r5, r4, r12 ; CHECK-LIS-NEXT: adds.w lr, r0, r3 ; CHECK-LIS-NEXT: vmov r4, r2, d8 ; CHECK-LIS-NEXT: adc.w r12, r5, r8 ; CHECK-LIS-NEXT: vmov r5, r0, d10 -; CHECK-LIS-NEXT: adds r6, r6, r4 -; CHECK-LIS-NEXT: adcs r2, r7 -; CHECK-LIS-NEXT: adds r6, r6, r5 +; CHECK-LIS-NEXT: adds r7, r7, r4 +; CHECK-LIS-NEXT: adcs r2, r6 +; CHECK-LIS-NEXT: adds r7, r7, r5 ; CHECK-LIS-NEXT: adc.w r8, r2, r0 -; CHECK-LIS-NEXT: vmov r7, r4, d1 +; CHECK-LIS-NEXT: vmov r6, r4, d1 ; CHECK-LIS-NEXT: vmov r2, r5, d3 ; CHECK-LIS-NEXT: vmov r3, r0, d0 -; CHECK-LIS-NEXT: adds r2, r2, r7 -; CHECK-LIS-NEXT: adc.w r7, r5, r4 +; CHECK-LIS-NEXT: adds r2, r2, r6 +; CHECK-LIS-NEXT: adc.w r6, r5, r4 ; CHECK-LIS-NEXT: vmov r5, r4, d7 ; CHECK-LIS-NEXT: adds r2, r2, r5 -; CHECK-LIS-NEXT: adcs r7, r4 +; CHECK-LIS-NEXT: adcs r6, r4 ; CHECK-LIS-NEXT: vmov r5, r4, d2 -; CHECK-LIS-NEXT: vmov q1[2], q1[0], r6, r2 -; CHECK-LIS-NEXT: vmov q1[3], q1[1], r8, r7 +; CHECK-LIS-NEXT: vmov q1[2], q1[0], r7, r2 +; CHECK-LIS-NEXT: vmov q1[3], q1[1], r8, r6 ; CHECK-LIS-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-LIS-NEXT: adds r3, r3, r5 ; CHECK-LIS-NEXT: adcs r0, r4 @@ -964,19 +1134,33 @@ entry: ; f32 define void @vld3_v2f32(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v2f32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldr s1, [r0, #16] -; CHECK-NEXT: vldr s5, [r0, #20] -; CHECK-NEXT: vmov.f32 s12, s8 -; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vmov.f32 s0, s9 -; CHECK-NEXT: vadd.f32 q0, q3, q0 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vstmia r1, {s0, s1} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: vld3_v2f32: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: vldrw.u32 q2, [r0] +; CHECK-LV-NEXT: vldr s1, [r0, #16] +; CHECK-LV-NEXT: vldr s5, [r0, #20] +; CHECK-LV-NEXT: vmov.f32 s12, s8 +; CHECK-LV-NEXT: vmov.f32 s13, s11 +; CHECK-LV-NEXT: vmov.f32 s0, s9 +; CHECK-LV-NEXT: vadd.f32 q0, q3, q0 +; CHECK-LV-NEXT: vmov.f32 s4, s10 +; CHECK-LV-NEXT: vadd.f32 q0, q0, q1 +; CHECK-LV-NEXT: vstmia r1, {s0, s1} +; CHECK-LV-NEXT: bx lr + +; CHECK-LIS-LABEL: vld3_v2f32: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] +; CHECK-LIS-NEXT: vldr s5, [r0, #16] +; CHECK-LIS-NEXT: vldr s1, [r0, #20] +; CHECK-LIS-NEXT: vmov.f32 s12, s8 +; CHECK-LIS-NEXT: vmov.f32 s13, s11 +; CHECK-LIS-NEXT: vmov.f32 s4, s9 +; CHECK-LIS-NEXT: vadd.f32 q1, q3, q1 +; CHECK-LIS-NEXT: vmov.f32 s0, s10 +; CHECK-LIS-NEXT: vadd.f32 q0, q1, q0 +; CHECK-LIS-NEXT: vstmia r1, {s0, s1} +; CHECK-LIS-NEXT: bx lr entry: %l1 = load <6 x float>, ptr %src, align 4 %s1 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> @@ -989,29 +1173,53 @@ entry: } define void @vld3_v4f32(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v4f32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s15, s18 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vadd.f32 q2, q2, q3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vadd.f32 q0, q2, q0 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: vld3_v4f32: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9} +; CHECK-LV-NEXT: vpush {d8, d9} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-LV-NEXT: vldrw.u32 q1, [r0] +; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-LV-NEXT: vmov.f32 s10, s2 +; CHECK-LV-NEXT: vmov.f32 s13, s0 +; CHECK-LV-NEXT: vmov.f32 s14, s3 +; CHECK-LV-NEXT: vmov.f32 s8, s4 +; CHECK-LV-NEXT: vmov.f32 s9, s7 +; CHECK-LV-NEXT: vmov.f32 s12, s5 +; CHECK-LV-NEXT: vmov.f32 s15, s18 +; CHECK-LV-NEXT: vmov.f32 s11, s17 +; CHECK-LV-NEXT: vadd.f32 q2, q2, q3 +; CHECK-LV-NEXT: vmov.f32 s0, s6 +; CHECK-LV-NEXT: vmov.f32 s2, s16 +; CHECK-LV-NEXT: vmov.f32 s3, s19 +; CHECK-LV-NEXT: vadd.f32 q0, q2, q0 +; CHECK-LV-NEXT: vstrw.32 q0, [r1] +; CHECK-LV-NEXT: vpop {d8, d9} +; CHECK-LV-NEXT: bx lr + +; CHECK-LIS-LABEL: vld3_v4f32: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9} +; CHECK-LIS-NEXT: vpush {d8, d9} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0] +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-LIS-NEXT: vmov.f32 s10, s2 +; CHECK-LIS-NEXT: vmov.f32 s13, s0 +; CHECK-LIS-NEXT: vmov.f32 s14, s3 +; CHECK-LIS-NEXT: vmov.f32 s8, s4 +; CHECK-LIS-NEXT: vmov.f32 s9, s7 +; CHECK-LIS-NEXT: vmov.f32 s12, s5 +; CHECK-LIS-NEXT: vmov.f32 s15, s18 +; CHECK-LIS-NEXT: vmov.f32 s11, s17 +; CHECK-LIS-NEXT: vadd.f32 q2, q2, q3 +; CHECK-LIS-NEXT: vmov.f32 s0, s6 +; CHECK-LIS-NEXT: vmov.f32 s2, s16 +; CHECK-LIS-NEXT: vmov.f32 s3, s19 +; CHECK-LIS-NEXT: vadd.f32 q0, q2, q0 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1] +; CHECK-LIS-NEXT: vpop {d8, d9} +; CHECK-LIS-NEXT: bx lr entry: %l1 = load <12 x float>, ptr %src, align 4 %s1 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> @@ -1024,46 +1232,87 @@ entry: } define void @vld3_v8f32(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v8f32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s15, s18 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vadd.f32 q2, q2, q3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vadd.f32 q0, q2, q0 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vmov.f32 s20, s8 -; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vadd.f32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vmov.f32 s7, s15 -; CHECK-NEXT: vadd.f32 q1, q4, q1 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: vld3_v8f32: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-LV-NEXT: vmov.f32 s10, s2 +; CHECK-LV-NEXT: vmov.f32 s13, s0 +; CHECK-LV-NEXT: vmov.f32 s14, s3 +; CHECK-LV-NEXT: vmov.f32 s8, s4 +; CHECK-LV-NEXT: vmov.f32 s9, s7 +; CHECK-LV-NEXT: vmov.f32 s12, s5 +; CHECK-LV-NEXT: vmov.f32 s15, s18 +; CHECK-LV-NEXT: vmov.f32 s11, s17 +; CHECK-LV-NEXT: vadd.f32 q2, q2, q3 +; CHECK-LV-NEXT: vmov.f32 s0, s6 +; CHECK-LV-NEXT: vmov.f32 s2, s16 +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LV-NEXT: vmov.f32 s3, s19 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LV-NEXT: vadd.f32 q0, q2, q0 +; CHECK-LV-NEXT: vldrw.u32 q2, [r0] +; CHECK-LV-NEXT: vmov.f32 s17, s4 +; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LV-NEXT: vmov.f32 s18, s7 +; CHECK-LV-NEXT: vmov.f32 s22, s6 +; CHECK-LV-NEXT: vmov.f32 s16, s9 +; CHECK-LV-NEXT: vmov.f32 s19, s14 +; CHECK-LV-NEXT: vmov.f32 s20, s8 +; CHECK-LV-NEXT: vmov.f32 s21, s11 +; CHECK-LV-NEXT: vmov.f32 s23, s13 +; CHECK-LV-NEXT: vadd.f32 q4, q5, q4 +; CHECK-LV-NEXT: vmov.f32 s4, s10 +; CHECK-LV-NEXT: vmov.f32 s6, s12 +; CHECK-LV-NEXT: vmov.f32 s7, s15 +; CHECK-LV-NEXT: vadd.f32 q1, q4, q1 +; CHECK-LV-NEXT: vstrw.32 q1, [r1] +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LV-NEXT: bx lr + +; CHECK-LIS-LABEL: vld3_v8f32: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-LIS-NEXT: vmov.f32 s10, s2 +; CHECK-LIS-NEXT: vmov.f32 s13, s0 +; CHECK-LIS-NEXT: vmov.f32 s14, s3 +; CHECK-LIS-NEXT: vmov.f32 s8, s4 +; CHECK-LIS-NEXT: vmov.f32 s9, s7 +; CHECK-LIS-NEXT: vmov.f32 s12, s5 +; CHECK-LIS-NEXT: vmov.f32 s15, s18 +; CHECK-LIS-NEXT: vmov.f32 s11, s17 +; CHECK-LIS-NEXT: vadd.f32 q2, q2, q3 +; CHECK-LIS-NEXT: vmov.f32 s0, s6 +; CHECK-LIS-NEXT: vmov.f32 s2, s16 +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LIS-NEXT: vmov.f32 s3, s19 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LIS-NEXT: vadd.f32 q0, q2, q0 +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] +; CHECK-LIS-NEXT: vmov.f32 s17, s4 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LIS-NEXT: vmov.f32 s18, s7 +; CHECK-LIS-NEXT: vmov.f32 s22, s6 +; CHECK-LIS-NEXT: vmov.f32 s16, s9 +; CHECK-LIS-NEXT: vmov.f32 s19, s14 +; CHECK-LIS-NEXT: vmov.f32 s20, s8 +; CHECK-LIS-NEXT: vmov.f32 s21, s11 +; CHECK-LIS-NEXT: vmov.f32 s23, s13 +; CHECK-LIS-NEXT: vadd.f32 q4, q5, q4 +; CHECK-LIS-NEXT: vmov.f32 s4, s10 +; CHECK-LIS-NEXT: vmov.f32 s6, s12 +; CHECK-LIS-NEXT: vmov.f32 s7, s15 +; CHECK-LIS-NEXT: vadd.f32 q1, q4, q1 +; CHECK-LIS-NEXT: vstrw.32 q1, [r1] +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LIS-NEXT: bx lr entry: %l1 = load <24 x float>, ptr %src, align 4 %s1 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> @@ -1076,80 +1325,155 @@ entry: } define void @vld3_v16f32(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v16f32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s15, s18 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vadd.f32 q2, q2, q3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vadd.f32 q0, q2, q0 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vmov.f32 s20, s8 -; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vadd.f32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s7, s15 -; CHECK-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-NEXT: vadd.f32 q1, q4, q1 -; CHECK-NEXT: vmov.f32 s18, s10 -; CHECK-NEXT: vmov.f32 s21, s8 -; CHECK-NEXT: vmov.f32 s22, s11 -; CHECK-NEXT: vmov.f32 s16, s12 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vmov.f32 s23, s26 -; CHECK-NEXT: vmov.f32 s19, s25 -; CHECK-NEXT: vadd.f32 q4, q4, q5 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s10, s24 -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vmov.f32 s11, s27 -; CHECK-NEXT: vldrw.u32 q5, [r0, #128] -; CHECK-NEXT: vadd.f32 q2, q4, q2 -; CHECK-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-NEXT: vmov.f32 s25, s12 -; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vmov.f32 s26, s15 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s30, s14 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vmov.f32 s24, s17 -; CHECK-NEXT: vmov.f32 s27, s22 -; CHECK-NEXT: vmov.f32 s28, s16 -; CHECK-NEXT: vmov.f32 s29, s19 -; CHECK-NEXT: vmov.f32 s31, s21 -; CHECK-NEXT: vadd.f32 q6, q7, q6 -; CHECK-NEXT: vmov.f32 s12, s18 -; CHECK-NEXT: vmov.f32 s14, s20 -; CHECK-NEXT: vmov.f32 s15, s23 -; CHECK-NEXT: vadd.f32 q3, q6, q3 -; CHECK-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: vld3_v16f32: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-LV-NEXT: vldrw.u32 q6, [r0, #176] +; CHECK-LV-NEXT: vmov.f32 s10, s2 +; CHECK-LV-NEXT: vmov.f32 s13, s0 +; CHECK-LV-NEXT: vmov.f32 s14, s3 +; CHECK-LV-NEXT: vmov.f32 s8, s4 +; CHECK-LV-NEXT: vmov.f32 s9, s7 +; CHECK-LV-NEXT: vmov.f32 s12, s5 +; CHECK-LV-NEXT: vmov.f32 s15, s18 +; CHECK-LV-NEXT: vmov.f32 s11, s17 +; CHECK-LV-NEXT: vadd.f32 q2, q2, q3 +; CHECK-LV-NEXT: vmov.f32 s0, s6 +; CHECK-LV-NEXT: vmov.f32 s2, s16 +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LV-NEXT: vmov.f32 s3, s19 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LV-NEXT: vadd.f32 q0, q2, q0 +; CHECK-LV-NEXT: vldrw.u32 q2, [r0] +; CHECK-LV-NEXT: vmov.f32 s17, s4 +; CHECK-LV-NEXT: vmov.f32 s18, s7 +; CHECK-LV-NEXT: vmov.f32 s22, s6 +; CHECK-LV-NEXT: vmov.f32 s16, s9 +; CHECK-LV-NEXT: vmov.f32 s19, s14 +; CHECK-LV-NEXT: vmov.f32 s20, s8 +; CHECK-LV-NEXT: vmov.f32 s21, s11 +; CHECK-LV-NEXT: vmov.f32 s23, s13 +; CHECK-LV-NEXT: vmov.f32 s4, s10 +; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-LV-NEXT: vmov.f32 s6, s12 +; CHECK-LV-NEXT: vadd.f32 q4, q5, q4 +; CHECK-LV-NEXT: vmov.f32 s7, s15 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #144] +; CHECK-LV-NEXT: vadd.f32 q1, q4, q1 +; CHECK-LV-NEXT: vmov.f32 s18, s10 +; CHECK-LV-NEXT: vmov.f32 s21, s8 +; CHECK-LV-NEXT: vmov.f32 s22, s11 +; CHECK-LV-NEXT: vmov.f32 s16, s12 +; CHECK-LV-NEXT: vmov.f32 s17, s15 +; CHECK-LV-NEXT: vmov.f32 s20, s13 +; CHECK-LV-NEXT: vmov.f32 s23, s26 +; CHECK-LV-NEXT: vmov.f32 s19, s25 +; CHECK-LV-NEXT: vadd.f32 q4, q4, q5 +; CHECK-LV-NEXT: vmov.f32 s8, s14 +; CHECK-LV-NEXT: vmov.f32 s10, s24 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-LV-NEXT: vmov.f32 s11, s27 +; CHECK-LV-NEXT: vldrw.u32 q5, [r0, #128] +; CHECK-LV-NEXT: vadd.f32 q2, q4, q2 +; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #96] +; CHECK-LV-NEXT: vmov.f32 s25, s12 +; CHECK-LV-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-LV-NEXT: vmov.f32 s26, s15 +; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LV-NEXT: vmov.f32 s30, s14 +; CHECK-LV-NEXT: vstrw.32 q1, [r1] +; CHECK-LV-NEXT: vmov.f32 s24, s17 +; CHECK-LV-NEXT: vmov.f32 s27, s22 +; CHECK-LV-NEXT: vmov.f32 s28, s16 +; CHECK-LV-NEXT: vmov.f32 s29, s19 +; CHECK-LV-NEXT: vmov.f32 s31, s21 +; CHECK-LV-NEXT: vadd.f32 q6, q7, q6 +; CHECK-LV-NEXT: vmov.f32 s12, s18 +; CHECK-LV-NEXT: vmov.f32 s14, s20 +; CHECK-LV-NEXT: vmov.f32 s15, s23 +; CHECK-LV-NEXT: vadd.f32 q3, q6, q3 +; CHECK-LV-NEXT: vstrw.32 q3, [r1, #32] +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LV-NEXT: bx lr + +; CHECK-LIS-LABEL: vld3_v16f32: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-LIS-NEXT: vldrw.u32 q6, [r0, #176] +; CHECK-LIS-NEXT: vmov.f32 s10, s2 +; CHECK-LIS-NEXT: vmov.f32 s13, s0 +; CHECK-LIS-NEXT: vmov.f32 s14, s3 +; CHECK-LIS-NEXT: vmov.f32 s8, s4 +; CHECK-LIS-NEXT: vmov.f32 s9, s7 +; CHECK-LIS-NEXT: vmov.f32 s12, s5 +; CHECK-LIS-NEXT: vmov.f32 s15, s18 +; CHECK-LIS-NEXT: vmov.f32 s11, s17 +; CHECK-LIS-NEXT: vadd.f32 q2, q2, q3 +; CHECK-LIS-NEXT: vmov.f32 s0, s6 +; CHECK-LIS-NEXT: vmov.f32 s2, s16 +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LIS-NEXT: vmov.f32 s3, s19 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LIS-NEXT: vadd.f32 q0, q2, q0 +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] +; CHECK-LIS-NEXT: vmov.f32 s17, s4 +; CHECK-LIS-NEXT: vmov.f32 s18, s7 +; CHECK-LIS-NEXT: vmov.f32 s22, s6 +; CHECK-LIS-NEXT: vmov.f32 s16, s9 +; CHECK-LIS-NEXT: vmov.f32 s19, s14 +; CHECK-LIS-NEXT: vmov.f32 s20, s8 +; CHECK-LIS-NEXT: vmov.f32 s21, s11 +; CHECK-LIS-NEXT: vmov.f32 s23, s13 +; CHECK-LIS-NEXT: vmov.f32 s4, s10 +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-LIS-NEXT: vmov.f32 s6, s12 +; CHECK-LIS-NEXT: vadd.f32 q4, q5, q4 +; CHECK-LIS-NEXT: vmov.f32 s7, s15 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #144] +; CHECK-LIS-NEXT: vadd.f32 q1, q4, q1 +; CHECK-LIS-NEXT: vmov.f32 s18, s10 +; CHECK-LIS-NEXT: vmov.f32 s21, s8 +; CHECK-LIS-NEXT: vmov.f32 s22, s11 +; CHECK-LIS-NEXT: vmov.f32 s16, s12 +; CHECK-LIS-NEXT: vmov.f32 s17, s15 +; CHECK-LIS-NEXT: vmov.f32 s20, s13 +; CHECK-LIS-NEXT: vmov.f32 s23, s26 +; CHECK-LIS-NEXT: vmov.f32 s19, s25 +; CHECK-LIS-NEXT: vadd.f32 q4, q4, q5 +; CHECK-LIS-NEXT: vmov.f32 s8, s14 +; CHECK-LIS-NEXT: vmov.f32 s10, s24 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-LIS-NEXT: vmov.f32 s11, s27 +; CHECK-LIS-NEXT: vldrw.u32 q5, [r0, #128] +; CHECK-LIS-NEXT: vadd.f32 q2, q4, q2 +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #96] +; CHECK-LIS-NEXT: vmov.f32 s25, s12 +; CHECK-LIS-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-LIS-NEXT: vmov.f32 s26, s15 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LIS-NEXT: vmov.f32 s30, s14 +; CHECK-LIS-NEXT: vstrw.32 q1, [r1] +; CHECK-LIS-NEXT: vmov.f32 s24, s17 +; CHECK-LIS-NEXT: vmov.f32 s27, s22 +; CHECK-LIS-NEXT: vmov.f32 s28, s16 +; CHECK-LIS-NEXT: vmov.f32 s29, s19 +; CHECK-LIS-NEXT: vmov.f32 s31, s21 +; CHECK-LIS-NEXT: vadd.f32 q6, q7, q6 +; CHECK-LIS-NEXT: vmov.f32 s12, s18 +; CHECK-LIS-NEXT: vmov.f32 s14, s20 +; CHECK-LIS-NEXT: vmov.f32 s15, s23 +; CHECK-LIS-NEXT: vadd.f32 q3, q6, q3 +; CHECK-LIS-NEXT: vstrw.32 q3, [r1, #32] +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LIS-NEXT: bx lr entry: %l1 = load <48 x float>, ptr %src, align 4 %s1 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> @@ -1288,86 +1612,167 @@ entry: } define void @vld3_v16f16(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v16f16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vmovx.f16 s6, s2 -; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s9 -; CHECK-NEXT: vmov.f32 s5, s8 -; CHECK-NEXT: vmovx.f16 s7, s12 -; CHECK-NEXT: vins.f16 s5, s6 -; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vins.f16 s6, s7 -; CHECK-NEXT: vmovx.f16 s16, s15 -; CHECK-NEXT: vmov.f32 s7, s14 -; CHECK-NEXT: vmovx.f16 s17, s3 -; CHECK-NEXT: vins.f16 s7, s16 -; CHECK-NEXT: vmovx.f16 s16, s0 -; CHECK-NEXT: vins.f16 s16, s2 -; CHECK-NEXT: vmovx.f16 s2, s1 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmovx.f16 s2, s8 -; CHECK-NEXT: vins.f16 s3, s2 -; CHECK-NEXT: vmovx.f16 s2, s11 -; CHECK-NEXT: vmovx.f16 s18, s10 -; CHECK-NEXT: vins.f16 s10, s2 -; CHECK-NEXT: vmovx.f16 s2, s14 -; CHECK-NEXT: vmovx.f16 s19, s13 -; CHECK-NEXT: vins.f16 s13, s2 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vins.f16 s18, s12 -; CHECK-NEXT: vins.f16 s19, s15 -; CHECK-NEXT: vins.f16 s17, s9 -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vadd.f16 q0, q0, q4 -; CHECK-NEXT: vadd.f16 q3, q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: vmovx.f16 s14, s6 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vmovx.f16 s14, s9 -; CHECK-NEXT: vmov.f32 s13, s8 -; CHECK-NEXT: vmovx.f16 s18, s10 -; CHECK-NEXT: vins.f16 s13, s14 -; CHECK-NEXT: vmovx.f16 s15, s0 -; CHECK-NEXT: vmov.f32 s14, s11 -; CHECK-NEXT: vins.f16 s18, s0 -; CHECK-NEXT: vins.f16 s14, s15 -; CHECK-NEXT: vmovx.f16 s16, s3 -; CHECK-NEXT: vmov.f32 s15, s2 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vins.f16 s15, s16 -; CHECK-NEXT: vmovx.f16 s16, s4 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmovx.f16 s17, s7 -; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vmovx.f16 s19, s1 -; CHECK-NEXT: vins.f16 s10, s0 -; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vins.f16 s1, s0 -; CHECK-NEXT: vins.f16 s16, s6 -; CHECK-NEXT: vmov.f32 s5, s7 -; CHECK-NEXT: vins.f16 s19, s3 -; CHECK-NEXT: vins.f16 s17, s9 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s7, s1 -; CHECK-NEXT: vadd.f16 q0, q1, q4 -; CHECK-NEXT: vadd.f16 q0, q0, q3 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: vld3_v16f16: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9} +; CHECK-LV-NEXT: vpush {d8, d9} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-LV-NEXT: vmovx.f16 s6, s2 +; CHECK-LV-NEXT: vmov.f32 s4, s1 +; CHECK-LV-NEXT: vins.f16 s4, s6 +; CHECK-LV-NEXT: vmovx.f16 s6, s9 +; CHECK-LV-NEXT: vmov.f32 s5, s8 +; CHECK-LV-NEXT: vmovx.f16 s7, s12 +; CHECK-LV-NEXT: vins.f16 s5, s6 +; CHECK-LV-NEXT: vmov.f32 s6, s11 +; CHECK-LV-NEXT: vins.f16 s6, s7 +; CHECK-LV-NEXT: vmovx.f16 s16, s15 +; CHECK-LV-NEXT: vmov.f32 s7, s14 +; CHECK-LV-NEXT: vmovx.f16 s17, s3 +; CHECK-LV-NEXT: vins.f16 s7, s16 +; CHECK-LV-NEXT: vmovx.f16 s16, s0 +; CHECK-LV-NEXT: vins.f16 s16, s2 +; CHECK-LV-NEXT: vmovx.f16 s2, s1 +; CHECK-LV-NEXT: vins.f16 s0, s2 +; CHECK-LV-NEXT: vmovx.f16 s2, s8 +; CHECK-LV-NEXT: vins.f16 s3, s2 +; CHECK-LV-NEXT: vmovx.f16 s2, s11 +; CHECK-LV-NEXT: vmovx.f16 s18, s10 +; CHECK-LV-NEXT: vins.f16 s10, s2 +; CHECK-LV-NEXT: vmovx.f16 s2, s14 +; CHECK-LV-NEXT: vmovx.f16 s19, s13 +; CHECK-LV-NEXT: vins.f16 s13, s2 +; CHECK-LV-NEXT: vmov.f32 s1, s3 +; CHECK-LV-NEXT: vins.f16 s18, s12 +; CHECK-LV-NEXT: vins.f16 s19, s15 +; CHECK-LV-NEXT: vins.f16 s17, s9 +; CHECK-LV-NEXT: vmov.f32 s2, s10 +; CHECK-LV-NEXT: vmov.f32 s3, s13 +; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-LV-NEXT: vadd.f16 q0, q0, q4 +; CHECK-LV-NEXT: vadd.f16 q3, q0, q1 +; CHECK-LV-NEXT: vldrw.u32 q1, [r0] +; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-LV-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-LV-NEXT: vmovx.f16 s14, s6 +; CHECK-LV-NEXT: vmov.f32 s12, s5 +; CHECK-LV-NEXT: vins.f16 s12, s14 +; CHECK-LV-NEXT: vmovx.f16 s14, s9 +; CHECK-LV-NEXT: vmov.f32 s13, s8 +; CHECK-LV-NEXT: vmovx.f16 s18, s10 +; CHECK-LV-NEXT: vins.f16 s13, s14 +; CHECK-LV-NEXT: vmovx.f16 s15, s0 +; CHECK-LV-NEXT: vmov.f32 s14, s11 +; CHECK-LV-NEXT: vins.f16 s18, s0 +; CHECK-LV-NEXT: vins.f16 s14, s15 +; CHECK-LV-NEXT: vmovx.f16 s16, s3 +; CHECK-LV-NEXT: vmov.f32 s15, s2 +; CHECK-LV-NEXT: vmovx.f16 s0, s5 +; CHECK-LV-NEXT: vins.f16 s15, s16 +; CHECK-LV-NEXT: vmovx.f16 s16, s4 +; CHECK-LV-NEXT: vins.f16 s4, s0 +; CHECK-LV-NEXT: vmovx.f16 s0, s8 +; CHECK-LV-NEXT: vmovx.f16 s17, s7 +; CHECK-LV-NEXT: vins.f16 s7, s0 +; CHECK-LV-NEXT: vmovx.f16 s0, s11 +; CHECK-LV-NEXT: vmovx.f16 s19, s1 +; CHECK-LV-NEXT: vins.f16 s10, s0 +; CHECK-LV-NEXT: vmovx.f16 s0, s2 +; CHECK-LV-NEXT: vins.f16 s1, s0 +; CHECK-LV-NEXT: vins.f16 s16, s6 +; CHECK-LV-NEXT: vmov.f32 s5, s7 +; CHECK-LV-NEXT: vins.f16 s19, s3 +; CHECK-LV-NEXT: vins.f16 s17, s9 +; CHECK-LV-NEXT: vmov.f32 s6, s10 +; CHECK-LV-NEXT: vmov.f32 s7, s1 +; CHECK-LV-NEXT: vadd.f16 q0, q1, q4 +; CHECK-LV-NEXT: vadd.f16 q0, q0, q3 +; CHECK-LV-NEXT: vstrw.32 q0, [r1] +; CHECK-LV-NEXT: vpop {d8, d9} +; CHECK-LV-NEXT: bx lr + +; CHECK-LIS-LABEL: vld3_v16f16: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9} +; CHECK-LIS-NEXT: vpush {d8, d9} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-LIS-NEXT: vmovx.f16 s6, s2 +; CHECK-LIS-NEXT: vmov.f32 s4, s1 +; CHECK-LIS-NEXT: vins.f16 s4, s6 +; CHECK-LIS-NEXT: vmovx.f16 s6, s9 +; CHECK-LIS-NEXT: vmov.f32 s5, s8 +; CHECK-LIS-NEXT: vmovx.f16 s7, s12 +; CHECK-LIS-NEXT: vins.f16 s5, s6 +; CHECK-LIS-NEXT: vmov.f32 s6, s11 +; CHECK-LIS-NEXT: vins.f16 s6, s7 +; CHECK-LIS-NEXT: vmovx.f16 s16, s15 +; CHECK-LIS-NEXT: vmov.f32 s7, s14 +; CHECK-LIS-NEXT: vmovx.f16 s17, s3 +; CHECK-LIS-NEXT: vins.f16 s7, s16 +; CHECK-LIS-NEXT: vmovx.f16 s16, s0 +; CHECK-LIS-NEXT: vins.f16 s16, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s1 +; CHECK-LIS-NEXT: vins.f16 s0, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s8 +; CHECK-LIS-NEXT: vins.f16 s3, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s11 +; CHECK-LIS-NEXT: vmovx.f16 s18, s10 +; CHECK-LIS-NEXT: vins.f16 s10, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s14 +; CHECK-LIS-NEXT: vmovx.f16 s19, s13 +; CHECK-LIS-NEXT: vins.f16 s13, s2 +; CHECK-LIS-NEXT: vmov.f32 s1, s3 +; CHECK-LIS-NEXT: vins.f16 s18, s12 +; CHECK-LIS-NEXT: vins.f16 s19, s15 +; CHECK-LIS-NEXT: vins.f16 s17, s9 +; CHECK-LIS-NEXT: vmov.f32 s2, s10 +; CHECK-LIS-NEXT: vmov.f32 s3, s13 +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-LIS-NEXT: vadd.f16 q0, q0, q4 +; CHECK-LIS-NEXT: vadd.f16 q3, q0, q1 +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-LIS-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-LIS-NEXT: vmovx.f16 s14, s2 +; CHECK-LIS-NEXT: vmov.f32 s12, s1 +; CHECK-LIS-NEXT: vins.f16 s12, s14 +; CHECK-LIS-NEXT: vmovx.f16 s14, s9 +; CHECK-LIS-NEXT: vmov.f32 s13, s8 +; CHECK-LIS-NEXT: vmovx.f16 s15, s4 +; CHECK-LIS-NEXT: vins.f16 s13, s14 +; CHECK-LIS-NEXT: vmov.f32 s14, s11 +; CHECK-LIS-NEXT: vins.f16 s14, s15 +; CHECK-LIS-NEXT: vmovx.f16 s16, s7 +; CHECK-LIS-NEXT: vmov.f32 s15, s6 +; CHECK-LIS-NEXT: vmovx.f16 s17, s3 +; CHECK-LIS-NEXT: vins.f16 s15, s16 +; CHECK-LIS-NEXT: vmovx.f16 s16, s0 +; CHECK-LIS-NEXT: vins.f16 s16, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s1 +; CHECK-LIS-NEXT: vins.f16 s0, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s8 +; CHECK-LIS-NEXT: vins.f16 s3, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s11 +; CHECK-LIS-NEXT: vmovx.f16 s18, s10 +; CHECK-LIS-NEXT: vins.f16 s10, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s6 +; CHECK-LIS-NEXT: vmovx.f16 s19, s5 +; CHECK-LIS-NEXT: vins.f16 s5, s2 +; CHECK-LIS-NEXT: vmov.f32 s1, s3 +; CHECK-LIS-NEXT: vins.f16 s18, s4 +; CHECK-LIS-NEXT: vins.f16 s19, s7 +; CHECK-LIS-NEXT: vins.f16 s17, s9 +; CHECK-LIS-NEXT: vmov.f32 s2, s10 +; CHECK-LIS-NEXT: vmov.f32 s3, s5 +; CHECK-LIS-NEXT: vadd.f16 q0, q0, q4 +; CHECK-LIS-NEXT: vadd.f16 q0, q0, q3 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1] +; CHECK-LIS-NEXT: vpop {d8, d9} +; CHECK-LIS-NEXT: bx lr entry: %l1 = load <48 x half>, ptr %src, align 4 %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> @@ -1436,4 +1841,4 @@ entry: %a = fadd <4 x double> %a1, %s3 store <4 x double> %a, ptr %dst ret void -} +} \ No newline at end of file