diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 662d84b7a60a8..ac5d614b233b4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23239,6 +23239,103 @@ static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
   return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
 }
 
+// Helper function to optimize small vector load + extension patterns.
+// These patterns would otherwise be scalarized into inefficient sequences.
+static SDValue performSmallVectorLoadExtCombine(SDNode *N, SelectionDAG &DAG) {
+  // Don't optimize if NEON is not available. Without NEON, the backend
+  // will need to scalarize these operations anyway.
+  const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+  if (!Subtarget.isNeonAvailable())
+    return SDValue();
+  // Don't optimize if SVE is being used for fixed-length vectors, because it
+  // has native support for these patterns.
+  if (Subtarget.useSVEForFixedLengthVectors())
+    return SDValue();
+
+  unsigned Opcode = N->getOpcode();
+  if (Opcode != ISD::ZERO_EXTEND && Opcode != ISD::SIGN_EXTEND &&
+      Opcode != ISD::ANY_EXTEND)
+    return SDValue();
+
+  SDValue Op = N->getOperand(0);
+  if (Op.getOpcode() != ISD::LOAD)
+    return SDValue();
+  LoadSDNode *LD = cast<LoadSDNode>(Op);
+  // Bail out on extending, multi-use, volatile or atomic loads: the
+  // transformation below replaces the load with a differently-typed one.
+  if (LD->getExtensionType() != ISD::NON_EXTLOAD || !LD->hasOneUse() ||
+      !LD->isSimple())
+    return SDValue();
+
+  EVT MemVT = LD->getMemoryVT();
+  EVT ResVT = N->getValueType(0);
+  // Check if this is a small vector pattern we want to optimize.
+  if (MemVT != MVT::v2i8 && MemVT != MVT::v2i16)
+    return SDValue();
+
+  unsigned NumElts = MemVT.getVectorNumElements();
+  unsigned SrcEltBits = MemVT.getScalarSizeInBits();
+  unsigned DstEltBits = ResVT.getScalarSizeInBits();
+  unsigned LoadBits = NumElts * SrcEltBits;
+  // Bail out on extends to element types wider than i64 (e.g. v2i128): the
+  // iterative widening below would request vector types that do not exist.
+  if (DstEltBits > 64)
+    return SDValue();
+
+  // Check alignment: the optimization loads a larger scalar, which may be
+  // unaligned, compared to what the original load will be legalized into.
+  // Note Align is measured in bytes, so convert the bit count first.
+  Align Alignment = LD->getAlign();
+  if (Subtarget.requiresStrictAlign() && Alignment < Align(LoadBits / 8))
+    return SDValue();
+
+  // The transformation strategy:
+  // 1. Load the memory as a large scalar and turn it into a 64-bit vector.
+  // 2. Bitcast to a narrow type (v8i8 or v4i16) that has efficient NEON extend.
+  // 3. Extend using ushll/sshll, extract subvector, repeat as needed.
+
+  // For ANY_EXTEND, we can choose either sign or zero extend - zero is
+  // typically cheaper.
+  if (Opcode == ISD::ANY_EXTEND)
+    Opcode = ISD::ZERO_EXTEND;
+
+  SDLoc DL(N);
+  SDValue Chain = LD->getChain();
+  SDValue BasePtr = LD->getBasePtr();
+  const MachinePointerInfo &PtrInfo = LD->getPointerInfo();
+  MVT LoadTy = MVT::getIntegerVT(LoadBits);
+  SDValue Load = DAG.getLoad(LoadTy, DL, Chain, BasePtr, PtrInfo, Alignment);
+
+  // SCALAR_TO_VECTOR needs to create a 64-bit vector for NEON instructions.
+  // The scalar load is inserted into the lower bits of a 64-bit register.
+  // We determine the appropriate 64-bit vector type based on load size,
+  // then bitcast to v8i8 or v4i16 for efficient ushll/sshll extends.
+  MVT ScalarVecVT = MVT::getVectorVT(LoadTy, 64 / LoadBits);
+  MVT NarrowVT = MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
+                                  64 / MemVT.getScalarSizeInBits());
+
+  SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarVecVT, Load);
+  Vec = DAG.getNode(ISD::BITCAST, DL, NarrowVT, Vec);
+  // Extend iteratively: each extend doubles the element size.
+  // We extend the full 64-bit vector to leverage NEON ushll/sshll instructions.
+  while (Vec.getScalarValueSizeInBits() < DstEltBits) {
+    MVT CurVT = Vec.getSimpleValueType();
+    unsigned NextBits = CurVT.getScalarSizeInBits() * 2;
+    MVT WideVT = MVT::getVectorVT(MVT::getIntegerVT(NextBits),
+                                  CurVT.getVectorNumElements());
+    Vec = DAG.getNode(Opcode, DL, WideVT, Vec);
+
+    // Extract only when: excess elements + still wide + done extending.
+    bool HasExcess = WideVT.getVectorNumElements() > NumElts;
+    bool StaysWide = WideVT.getSizeInBits() >= 64;
+    bool IsDone = NextBits >= DstEltBits;
+    if (HasExcess && StaysWide && IsDone) {
+      MVT ExtractVT = MVT::getVectorVT(WideVT.getScalarType(), NumElts);
+      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Vec,
+                        DAG.getConstant(0, DL, MVT::i64));
+    }
+  }
+
+  return DAG.getMergeValues({Vec, Load.getValue(1)}, DL);
+}
+
 static SDValue performExtendCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
@@ -23288,6 +23385,10 @@ static SDValue performExtendCombine(SDNode *N,
                        NewAnyExtend);
   }
 
+  // Try to optimize small vector load + extension patterns.
+  if (SDValue Result = performSmallVectorLoadExtCombine(N, DAG))
+    return Result;
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
index 317feb5ad9ad0..bc0edc9b5eca6 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
@@ -22,17 +22,16 @@ define <2 x i16> @test0(ptr %i16_ptr, i64 %inc) {
 define <2 x i16> @test1(ptr %v2i16_ptr) {
 ; CHECK-LE-LABEL: test1:
 ; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-LE-NEXT: add x8, x0, #2
-; CHECK-LE-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-LE-NEXT: ret
 ;
 ; CHECK-BE-LABEL: test1:
 ; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-BE-NEXT: 
add x8, x0, #2 -; CHECK-BE-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %v2i16 = load <2 x i16>, ptr %v2i16_ptr @@ -66,17 +65,18 @@ define <2 x i16> @test2(ptr %i16_ptr, i64 %inc) { define <2 x i8> @test3(ptr %v2i8_ptr) { ; CHECK-LE-LABEL: test3: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-LE-NEXT: add x8, x0, #1 -; CHECK-LE-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test3: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-BE-NEXT: add x8, x0, #1 -; CHECK-BE-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %v2i8 = load <2 x i8>, ptr %v2i8_ptr @@ -105,19 +105,18 @@ define <4 x i8> @test4(ptr %v4i8_ptr) { define <2 x i32> @fsext_v2i32(ptr %a) { ; CHECK-LE-LABEL: fsext_v2i32: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ldrsb w8, [x0] -; CHECK-LE-NEXT: ldrsb w9, [x0, #1] -; CHECK-LE-NEXT: fmov s0, w8 -; CHECK-LE-NEXT: mov v0.s[1], w9 +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: fsext_v2i32: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ldrsb w8, [x0] -; CHECK-BE-NEXT: ldrsb w9, [x0, #1] -; CHECK-BE-NEXT: fmov s0, w8 -; CHECK-BE-NEXT: mov v0.s[1], w9 +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: 
ret %x = load <2 x i8>, ptr %a @@ -249,19 +248,18 @@ define i32 @loadExti32(ptr %ref) { define <2 x i16> @fsext_v2i16(ptr %a) { ; CHECK-LE-LABEL: fsext_v2i16: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ldrsb w8, [x0] -; CHECK-LE-NEXT: ldrsb w9, [x0, #1] -; CHECK-LE-NEXT: fmov s0, w8 -; CHECK-LE-NEXT: mov v0.s[1], w9 +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: fsext_v2i16: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ldrsb w8, [x0] -; CHECK-BE-NEXT: ldrsb w9, [x0, #1] -; CHECK-BE-NEXT: fmov s0, w8 -; CHECK-BE-NEXT: mov v0.s[1], w9 +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %x = load <2 x i8>, ptr %a @@ -497,3 +495,219 @@ define <4 x i8> @strict_align_unaligned(ptr %v4i8_ptr) "target-features"="+stric %v4i8 = load <4 x i8>, ptr %v4i8_ptr, align 1 ret <4 x i8> %v4i8 } + +define <2 x i16> @zext_v2i8_v2i16(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i16: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i16: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i16> + ret <2 x i16> %y +} + +define <2 x i32> @zext_v2i8_v2i32(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // 
kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @zext_v2i8_v2i64(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i64> + ret <2 x i64> %y +} + +define <2 x i32> @zext_v2i16_v2i32(ptr %a) { +; CHECK-LE-LABEL: zext_v2i16_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i16_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = zext <2 x i16> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @zext_v2i16_v2i64(ptr %a) { +; CHECK-LE-LABEL: zext_v2i16_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i16_v2i64: +; CHECK-BE: // %bb.0: +; 
CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = zext <2 x i16> %x to <2 x i64> + ret <2 x i64> %y +} + +define <2 x i16> @sext_v2i8_v2i16(ptr %a) { +; CHECK-LE-LABEL: sext_v2i8_v2i16: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i8_v2i16: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = sext <2 x i8> %x to <2 x i16> + ret <2 x i16> %y +} + +define <2 x i32> @sext_v2i8_v2i32(ptr %a) { +; CHECK-LE-LABEL: sext_v2i8_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i8_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = sext <2 x i8> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @sext_v2i8_v2i64(ptr %a) { +; CHECK-LE-LABEL: sext_v2i8_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i8_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; 
CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = sext <2 x i8> %x to <2 x i64> + ret <2 x i64> %y +} + +define <2 x i32> @sext_v2i16_v2i32(ptr %a) { +; CHECK-LE-LABEL: sext_v2i16_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i16_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = sext <2 x i16> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @sext_v2i16_v2i64(ptr %a) { +; CHECK-LE-LABEL: sext_v2i16_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i16_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = sext <2 x i16> %x to <2 x i64> + ret <2 x i64> %y +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index 6e5c666bdbc75..2cd54d4113542 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -222,23 +222,17 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind { define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64: ; CHECK-NEON: // %bb.0: -; CHECK-NEON-NEXT: 
ldrh w8, [x0] -; CHECK-NEON-NEXT: ldrh w9, [x0, #2] +; CHECK-NEON-NEXT: ldr s0, [x0] ; CHECK-NEON-NEXT: ldr d1, [x1] -; CHECK-NEON-NEXT: fmov d0, x8 -; CHECK-NEON-NEXT: mov v0.d[1], x9 -; CHECK-NEON-NEXT: xtn v0.2s, v0.2d +; CHECK-NEON-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64: ; CHECK-SVE: // %bb.0: -; CHECK-SVE-NEXT: ldrh w8, [x0] -; CHECK-SVE-NEXT: ldrh w9, [x0, #2] +; CHECK-SVE-NEXT: ldr s0, [x0] ; CHECK-SVE-NEXT: ldr d1, [x1] -; CHECK-SVE-NEXT: fmov d0, x8 -; CHECK-SVE-NEXT: mov v0.d[1], x9 -; CHECK-SVE-NEXT: xtn v0.2s, v0.2d +; CHECK-SVE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-SVE-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll index cabb0e7278e40..d646cfe9072b5 100644 --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -263,16 +263,16 @@ define <16 x i16> @load_v16i8(ptr %p) { define <2 x i16> @std_v2i8_v2i16(ptr %p) { ; CHECK-LABEL: std_v2i8_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: ldrb w9, [x0, #3] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: mov v1.s[1], w9 -; CHECK-NEXT: shl v0.2s, v0.2s, #3 -; CHECK-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ldr h0, [x0, #2] +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: mov h2, v0.h[0] +; CHECK-NEXT: mov h3, v1.h[0] +; CHECK-NEXT: mov v2.h[2], v0.h[1] +; CHECK-NEXT: mov v3.h[2], v1.h[1] +; CHECK-NEXT: shl v0.2s, v2.2s, #3 +; CHECK-NEXT: add v0.2s, v3.2s, v0.2s ; CHECK-NEXT: ret %l1 = load <2 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 2 diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll index 
c4bb6e37d6eaf..b138fa4085427 100644 --- a/llvm/test/CodeGen/AArch64/load.ll +++ b/llvm/test/CodeGen/AArch64/load.ll @@ -230,9 +230,9 @@ define <2 x i64> @load_v2i64(ptr %ptr) { define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b) { ; CHECK-SD-LABEL: load_v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: ret ; @@ -269,9 +269,8 @@ define <32 x i8> @load_v32i8(ptr %ptr) { define <2 x i16> @load_v2i16(ptr %ptr) { ; CHECK-SD-LABEL: load_v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: ret ;