diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 75c4fabe03dd4..20283ad8f2689 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -877,7 +877,21 @@ END_TWO_BYTE_PACK() /// Return true if there are exactly NUSES uses of the indicated value. /// This method ignores uses of other values defined by this operation. - bool hasNUsesOfValue(unsigned NUses, unsigned Value) const; + bool hasNUsesOfValue(unsigned NUses, unsigned Value) const { + assert(Value < getNumValues() && "Bad value!"); + + // TODO: Only iterate over uses of a given value of the node + for (SDUse &U : uses()) { + if (U.getResNo() == Value) { + if (NUses == 0) + return false; + --NUses; + } + } + + // Found exactly the right number of uses? + return NUses == 0; + } /// Return true if there are any use of the indicated value. /// This method ignores uses of other values defined by this operation. diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index a4c3d042fe3a4..8000c8f68ac04 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1817,7 +1817,7 @@ class TargetLoweringBase { EVT NewVT) const { // By default, assume that it is cheaper to extract a subvector from a wide // vector load rather than creating multiple narrow vector loads. - if (NewVT.isVector() && !Load->hasOneUse()) + if (NewVT.isVector() && !SDValue(Load, 0).hasOneUse()) return false; return true; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 0a3210a10d394..9e61df7047d4a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -12444,25 +12444,6 @@ const EVT *SDNode::getValueTypeList(MVT VT) { return &SimpleVTArray.VTs[VT.SimpleTy]; } -/// hasNUsesOfValue - Return true if there are exactly NUSES uses of the -/// indicated value. This method ignores uses of other values defined by this -/// operation. -bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const { - assert(Value < getNumValues() && "Bad value!"); - - // TODO: Only iterate over uses of a given value of the node - for (SDUse &U : uses()) { - if (U.getResNo() == Value) { - if (NUses == 0) - return false; - --NUses; - } - } - - // Found exactly the right number of uses? - return NUses == 0; -} - /// hasAnyUseOfValue - Return true if there are any use of the indicated /// value. This method ignores uses of other values defined by this operation. bool SDNode::hasAnyUseOfValue(unsigned Value) const { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3eccf05e0014a..1883337e0ef3a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3268,7 +3268,8 @@ bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, // those uses are extracted directly into a store, then the extract + store // can be store-folded. Therefore, it's probably not worth splitting the load. EVT VT = Load->getValueType(0); - if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) { + if ((VT.is256BitVector() || VT.is512BitVector()) && + !SDValue(Load, 0).hasOneUse()) { for (SDUse &Use : Load->uses()) { // Skip uses of the chain value. Result 0 of the node is the load value. 
if (Use.getResNo() != 0) diff --git a/llvm/test/CodeGen/AArch64/merge-store.ll b/llvm/test/CodeGen/AArch64/merge-store.ll index 6653984562ae6..74e3a6d27d3e0 100644 --- a/llvm/test/CodeGen/AArch64/merge-store.ll +++ b/llvm/test/CodeGen/AArch64/merge-store.ll @@ -11,14 +11,14 @@ define void @blam() { ; SPLITTING-NEXT: adrp x8, g1 ; SPLITTING-NEXT: add x8, x8, :lo12:g1 ; SPLITTING-NEXT: adrp x9, g0 -; SPLITTING-NEXT: ldr q0, [x9, :lo12:g0] +; SPLITTING-NEXT: ldr d0, [x9, :lo12:g0] ; SPLITTING-NEXT: str d0, [x8] ; SPLITTING-NEXT: ret ; ; MISALIGNED-LABEL: blam: ; MISALIGNED: // %bb.0: ; MISALIGNED-NEXT: adrp x8, g0 -; MISALIGNED-NEXT: ldr q0, [x8, :lo12:g0] +; MISALIGNED-NEXT: ldr d0, [x8, :lo12:g0] ; MISALIGNED-NEXT: adrp x8, g1 ; MISALIGNED-NEXT: add x8, x8, :lo12:g1 ; MISALIGNED-NEXT: str d0, [x8] diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll index 8d028c11b4a6b..15bf6a45f7541 100644 --- a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll +++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll @@ -531,18 +531,18 @@ define void @quux() #1 { ; CHECK-NEXT: ldr x18, [x19, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldr x0, [x19, #72] // 8-byte Folded Reload ; CHECK-NEXT: ldr x1, [x19, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldr x15, [x19, #224] // 8-byte Folded Reload ; CHECK-NEXT: ldr x2, [x19, #216] // 8-byte Folded Reload ; CHECK-NEXT: ldr x3, [x19, #120] // 8-byte Folded Reload ; CHECK-NEXT: ldr x4, [x19, #112] // 8-byte Folded Reload ; CHECK-NEXT: ldr x5, [x19, #104] // 8-byte Folded Reload ; CHECK-NEXT: ldr x6, [x19, #96] // 8-byte Folded Reload -; CHECK-NEXT: ldr x7, [x19, #224] // 8-byte Folded Reload -; CHECK-NEXT: ldr x20, [x19, #152] // 8-byte Folded Reload -; CHECK-NEXT: ldr x21, [x19, #144] // 8-byte Folded Reload -; CHECK-NEXT: ldr x22, [x19, #136] // 8-byte Folded Reload -; CHECK-NEXT: ldr x23, [x19, #128] // 8-byte Folded Reload -; CHECK-NEXT: ldr x16, [x19, #200] // 8-byte Folded Reload -; CHECK-NEXT: ldr x15, [x19, #208] // 8-byte Folded Reload +; CHECK-NEXT: ldr x16, [x19, #152] // 8-byte Folded Reload +; CHECK-NEXT: ldr x7, [x19, #144] // 8-byte Folded Reload +; CHECK-NEXT: ldr x20, [x19, #136] // 8-byte Folded Reload +; CHECK-NEXT: ldr x21, [x19, #128] // 8-byte Folded Reload +; CHECK-NEXT: ldr x23, [x19, #200] // 8-byte Folded Reload +; CHECK-NEXT: ldr x22, [x19, #208] // 8-byte Folded Reload ; CHECK-NEXT: ldr x24, [x19, #192] // 8-byte Folded Reload ; CHECK-NEXT: ldr x26, [x19, #176] // 8-byte Folded Reload ; CHECK-NEXT: ldr x25, [x19, #184] // 8-byte Folded Reload @@ -562,36 +562,34 @@ define void @quux() #1 { ; CHECK-NEXT: add x25, x25, x27, lsl #2 ; CHECK-NEXT: str x25, [x26] ; CHECK-NEXT: ldr p0, [x24] -; CHECK-NEXT: ldr x24, [x16] +; CHECK-NEXT: ldr x24, [x23] ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x24] ; CHECK-NEXT: mov z0.d, z16.d ; CHECK-NEXT: mov z1.d, z24.d ; CHECK-NEXT: st1w { z1.s }, p2, [x13, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p2, [x13] -; CHECK-NEXT: ldr x24, [x15] -; CHECK-NEXT: ldr x15, [x16] -; CHECK-NEXT: add x15, x15, x24, lsl #2 -; CHECK-NEXT: str x15, [x16] -; CHECK-NEXT: mov x16, x2 -; CHECK-NEXT: incd x16 +; CHECK-NEXT: ldr x24, [x22] +; CHECK-NEXT: ldr x22, [x23] +; CHECK-NEXT: add x22, x22, x24, lsl #2 +; CHECK-NEXT: str x22, [x23] ; CHECK-NEXT: ldr p1, [x2] -; CHECK-NEXT: mov x15, x7 -; CHECK-NEXT: incd x15 -; CHECK-NEXT: ldr p0, [x7] +; CHECK-NEXT: ldr p0, [x15] ; CHECK-NEXT: ld1w { z1.s }, p2/z, [x14] ; CHECK-NEXT: ld1w { z0.s }, 
p2/z, [x13] -; CHECK-NEXT: str p1, [x23] -; CHECK-NEXT: str p0, [x22] -; CHECK-NEXT: st1w { z1.s }, p2, [x21] -; CHECK-NEXT: st1w { z0.s }, p2, [x20] -; CHECK-NEXT: ldr p0, [x23] -; CHECK-NEXT: ldr p1, [x22] -; CHECK-NEXT: ld1w { z0.s }, p2/z, [x21] -; CHECK-NEXT: ld1w { z1.s }, p2/z, [x20] +; CHECK-NEXT: str p1, [x21] +; CHECK-NEXT: str p0, [x20] +; CHECK-NEXT: st1w { z1.s }, p2, [x7] +; CHECK-NEXT: st1w { z0.s }, p2, [x16] +; CHECK-NEXT: ldr p0, [x21] +; CHECK-NEXT: ldr p1, [x20] +; CHECK-NEXT: ld1w { z0.s }, p2/z, [x7] +; CHECK-NEXT: ld1w { z1.s }, p2/z, [x16] ; CHECK-NEXT: fmopa za0.s, p0/m, p1/m, z0.s, z1.s +; CHECK-NEXT: mov x16, x2 +; CHECK-NEXT: incd x16 ; CHECK-NEXT: ldr p1, [x16] -; CHECK-NEXT: ldr p0, [x7] +; CHECK-NEXT: ldr p0, [x15] ; CHECK-NEXT: ld1w { z1.s }, p2/z, [x14, #1, mul vl] ; CHECK-NEXT: ld1w { z0.s }, p2/z, [x13] ; CHECK-NEXT: str p1, [x6] @@ -604,6 +602,7 @@ define void @quux() #1 { ; CHECK-NEXT: ld1w { z1.s }, p2/z, [x3] ; CHECK-NEXT: fmopa za1.s, p0/m, p1/m, z0.s, z1.s ; CHECK-NEXT: ldr p1, [x2] +; CHECK-NEXT: incd x15 ; CHECK-NEXT: ldr p0, [x15] ; CHECK-NEXT: ld1w { z1.s }, p2/z, [x14] ; CHECK-NEXT: ld1w { z0.s }, p2/z, [x13, #1, mul vl] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll index 0f8f4a6843eae..8fac0e1067684 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll @@ -31,9 +31,7 @@ define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) vscale_range(2,0) #0 { define void @extract_subvector_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl32 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 +; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret %op = load <32 x i8>, ptr %a @@ -43,22 +41,13 @@ define void @extract_subvector_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 { } define void @extract_subvector_v64i8(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: extract_subvector_v64i8: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 -; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] -; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: extract_subvector_v64i8: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.b, vl64 -; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] -; VBITS_GE_512-NEXT: ptrue p0.b, vl32 -; VBITS_GE_512-NEXT: ext z0.b, z0.b, z0.b, #32 -; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x1] -; VBITS_GE_512-NEXT: ret +; CHECK-LABEL: extract_subvector_v64i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: mov w8, #32 // =0x20 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: st1b { z0.b }, p0, [x1] +; CHECK-NEXT: ret %op = load <64 x i8>, ptr %a %ret = call <32 x i8> @llvm.vector.extract.v32i8.v64i8(<64 x i8> %op, i64 32) store <32 x i8> %ret, ptr %b @@ -68,10 +57,9 @@ define void @extract_subvector_v64i8(ptr %a, ptr %b) #0 { define void @extract_subvector_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: extract_subvector_v128i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl128 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.b, vl64 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64 +; CHECK-NEXT: mov w8, #64 // =0x40 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; CHECK-NEXT: st1b { 
z0.b }, p0, [x1] ; CHECK-NEXT: ret %op = load <128 x i8>, ptr %a @@ -83,10 +71,9 @@ define void @extract_subvector_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @extract_subvector_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: extract_subvector_v256i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl256 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.b, vl128 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 +; CHECK-NEXT: mov w8, #128 // =0x80 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; CHECK-NEXT: st1b { z0.b }, p0, [x1] ; CHECK-NEXT: ret %op = load <256 x i8>, ptr %a @@ -123,9 +110,7 @@ define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) vscale_range(2,0) #0 { define void @extract_subvector_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 +; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret %op = load <16 x i16>, ptr %a @@ -135,22 +120,13 @@ define void @extract_subvector_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 { } define void @extract_subvector_v32i16(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: extract_subvector_v32i16: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] -; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: extract_subvector_v32i16: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ptrue p0.h, vl16 -; VBITS_GE_512-NEXT: ext z0.b, z0.b, z0.b, #32 -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] -; VBITS_GE_512-NEXT: ret +; CHECK-LABEL: extract_subvector_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov x8, #16 // =0x10 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret %op = load <32 x i16>, ptr %a %ret = call <16 x i16> @llvm.vector.extract.v16i16.v32i16(<32 x i16> %op, i64 16) store <16 x i16> %ret, ptr %b @@ -160,10 +136,9 @@ define void @extract_subvector_v32i16(ptr %a, ptr %b) #0 { define void @extract_subvector_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: extract_subvector_v64i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64 +; CHECK-NEXT: mov x8, #32 // =0x20 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %op = load <64 x i16>, ptr %a @@ -175,10 +150,9 @@ define void @extract_subvector_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @extract_subvector_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: extract_subvector_v128i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 +; CHECK-NEXT: mov x8, #64 // =0x40 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %op = load <128 x i16>, ptr %a @@ -214,9 +188,7 @@ define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) vscale_range(2,0) #0 { define void @extract_subvector_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v8i32: ; CHECK: 
// %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 +; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret %op = load <8 x i32>, ptr %a @@ -226,22 +198,13 @@ define void @extract_subvector_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { } define void @extract_subvector_v16i32(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: extract_subvector_v16i32: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] -; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: extract_subvector_v16i32: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 -; VBITS_GE_512-NEXT: ext z0.b, z0.b, z0.b, #32 -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] -; VBITS_GE_512-NEXT: ret +; CHECK-LABEL: extract_subvector_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: mov x8, #8 // =0x8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: ret %op = load <16 x i32>, ptr %a %ret = call <8 x i32> @llvm.vector.extract.v8i32.v16i32(<16 x i32> %op, i64 8) store <8 x i32> %ret, ptr %b @@ -251,10 +214,9 @@ define void @extract_subvector_v16i32(ptr %a, ptr %b) #0 { define void @extract_subvector_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: extract_subvector_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64 +; CHECK-NEXT: mov x8, #16 // =0x10 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %op = load <32 x i32>, ptr %a @@ -266,10 +228,9 @@ define void @extract_subvector_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @extract_subvector_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: extract_subvector_v64i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 +; CHECK-NEXT: mov x8, #32 // =0x20 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %op = load <64 x i32>, ptr %a @@ -294,9 +255,7 @@ define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) vscale_range(2,0) #0 { define void @extract_subvector_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 +; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret %op = load <4 x i64>, ptr %a @@ -331,6 +290,14 @@ define void @extract_subvector_v16i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] ; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: extract_subvector_v16i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_512-NEXT: ret %op = load <16 x i64>, ptr %a %ret = call <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64> %op, i64 8) store <8 x i64> %ret, 
ptr %b @@ -378,9 +345,7 @@ define <4 x half> @extract_subvector_v8f16(<8 x half> %op) vscale_range(2,0) #0 define void @extract_subvector_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 +; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret %op = load <16 x half>, ptr %a @@ -390,22 +355,13 @@ define void @extract_subvector_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { } define void @extract_subvector_v32f16(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: extract_subvector_v32f16: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] -; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: extract_subvector_v32f16: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ptrue p0.h, vl16 -; VBITS_GE_512-NEXT: ext z0.b, z0.b, z0.b, #32 -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] -; VBITS_GE_512-NEXT: ret +; CHECK-LABEL: extract_subvector_v32f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov x8, #16 // =0x10 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret %op = load <32 x half>, ptr %a %ret = call <16 x half> @llvm.vector.extract.v16f16.v32f16(<32 x half> %op, i64 16) store <16 x half> %ret, ptr %b @@ -415,10 +371,9 @@ define void @extract_subvector_v32f16(ptr %a, ptr %b) #0 { define void @extract_subvector_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: extract_subvector_v64f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64 +; CHECK-NEXT: mov x8, #32 // =0x20 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %op = load <64 x half>, ptr %a @@ -430,10 +385,9 @@ define void @extract_subvector_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @extract_subvector_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: extract_subvector_v128f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 +; CHECK-NEXT: mov x8, #64 // =0x40 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %op = load <128 x half>, ptr %a @@ -469,9 +423,7 @@ define <2 x float> @extract_subvector_v4f32(<4 x float> %op) vscale_range(2,0) # define void @extract_subvector_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 +; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret %op = load <8 x float>, ptr %a @@ -481,22 +433,13 @@ define void @extract_subvector_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { } define void @extract_subvector_v16f32(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: extract_subvector_v16f32: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, 
lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] -; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: extract_subvector_v16f32: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 -; VBITS_GE_512-NEXT: ext z0.b, z0.b, z0.b, #32 -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] -; VBITS_GE_512-NEXT: ret +; CHECK-LABEL: extract_subvector_v16f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: mov x8, #8 // =0x8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: ret %op = load <16 x float>, ptr %a %ret = call <8 x float> @llvm.vector.extract.v8f32.v16f32(<16 x float> %op, i64 8) store <8 x float> %ret, ptr %b @@ -506,10 +449,9 @@ define void @extract_subvector_v16f32(ptr %a, ptr %b) #0 { define void @extract_subvector_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: extract_subvector_v32f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64 +; CHECK-NEXT: mov x8, #16 // =0x10 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %op = load <32 x float>, ptr %a @@ -521,10 +463,9 @@ define void @extract_subvector_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @extract_subvector_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: extract_subvector_v64f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 +; CHECK-NEXT: mov x8, #32 // =0x20 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %op = load <64 x float>, ptr %a @@ -549,9 +490,7 @@ define <1 x double> @extract_subvector_v2f64(<2 x double> %op) vscale_range(2,0) define void @extract_subvector_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 +; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret %op = load <4 x double>, ptr %a @@ -561,22 +500,13 @@ define void @extract_subvector_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { } define void @extract_subvector_v8f64(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: extract_subvector_v8f64: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] -; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: extract_subvector_v8f64: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ptrue p0.d, vl4 -; VBITS_GE_512-NEXT: ext z0.b, z0.b, z0.b, #32 -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] -; VBITS_GE_512-NEXT: ret +; CHECK-LABEL: extract_subvector_v8f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: mov x8, #4 // =0x4 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEXT: ret %op = load <8 x double>, ptr %a %ret = call <4 x double> @llvm.vector.extract.v4f64.v8f64(<8 x double> %op, i64 4) store <4 x double> %ret, ptr %b @@ -586,10 +516,9 @@ define void 
@extract_subvector_v8f64(ptr %a, ptr %b) #0 { define void @extract_subvector_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: extract_subvector_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64 +; CHECK-NEXT: mov x8, #8 // =0x8 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %op = load <16 x double>, ptr %a @@ -601,10 +530,9 @@ define void @extract_subvector_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @extract_subvector_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: extract_subvector_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 +; CHECK-NEXT: mov x8, #16 // =0x10 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %op = load <32 x double>, ptr %a diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll index 1a0aa09e2e40a..ec06b0e5b3d04 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll @@ -56,14 +56,14 @@ define amdgpu_kernel void @v6i16_arg(<6 x i16> %in) nounwind { } ; FUNC-LABEL: {{^}}v5i32_arg: -; GCN: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; GCN: s_load_dwordx4 s[0:3], s[8:9], 0x0 define amdgpu_kernel void @v5i32_arg(<5 x i32> %in) nounwind { store <5 x i32> %in, ptr addrspace(1) null ret void } ; FUNC-LABEL: {{^}}v6i32_arg: -; GCN: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; GCN: s_load_dwordx4 s[0:3], s[8:9], 0x0 define amdgpu_kernel void @v6i32_arg(<6 x i32> %in) nounwind { store <6 x i32> %in, ptr addrspace(1) null ret void diff --git a/llvm/test/CodeGen/ARM/vpadd.ll b/llvm/test/CodeGen/ARM/vpadd.ll index 0597d44bc7cbf..a98eabc63ef2a 100644 --- a/llvm/test/CodeGen/ARM/vpadd.ll +++ b/llvm/test/CodeGen/ARM/vpadd.ll @@ -368,7 +368,7 @@ define void @addCombineToVPADDL_u8(ptr %cbcr, ptr %X) nounwind ssp { define void @addCombineToVPADDL_u8_early_zext(ptr %cbcr, ptr %X) nounwind ssp { ; CHECK-LABEL: addCombineToVPADDL_u8_early_zext: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vld1.8 {d16}, [r0:64] ; CHECK-NEXT: vmovl.u8 q8, d16 ; CHECK-NEXT: vpadd.i16 d16, d16, d17 ; CHECK-NEXT: vstr d16, [r1] diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll index d60ce408278da..692a7ce0b20e8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll @@ -1185,7 +1185,7 @@ define double @extractelt_nxv8f64_idx( %v, i32 zeroext %idx define void @store_extractelt_nxv8f64(ptr %x, ptr %p) { ; CHECK-LABEL: store_extractelt_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vl8re64.v v8, (a0) +; CHECK-NEXT: vl1re64.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: vse64.v v8, (a1) @@ -1199,7 +1199,7 @@ define void @store_extractelt_nxv8f64(ptr %x, ptr %p) { define void @store_vfmv_f_s_nxv8f64(ptr %x, ptr %p) { ; CHECK-LABEL: store_vfmv_f_s_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vl8re64.v v8, (a0) +; CHECK-NEXT: vl1re64.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-NEXT: vse64.v v8, (a1) ; CHECK-NEXT: ret diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll index e9dca2c42e835..e2711a0231509 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll @@ -1,26 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,CHECK32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,CHECK64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,CHECK32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,CHECK64 -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,CHECK32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,CHECK64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,CHECK32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,CHECK64 -; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s -; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,VLS,CHECK32 +; RUN: llc < %s -mtriple=riscv64 
-mattr=+m,+v,+zvfhmin,+zvfbfmin -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,VLS,CHECK64 +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,VLS,CHECK32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,VLS,CHECK64 define void @extract_v2i8_v4i8_0(ptr %x, ptr %y) { ; CHECK-LABEL: extract_v2i8_v4i8_0: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vse8.v v8, (a1) +; CHECK-NEXT: lh a0, 0(a0) +; CHECK-NEXT: sh a0, 0(a1) ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %c = call <2 x i8> @llvm.vector.extract.v2i8.v4i8(<4 x i8> %a, i64 0) @@ -31,12 +29,8 @@ define void @extract_v2i8_v4i8_0(ptr %x, ptr %y) { define void @extract_v2i8_v4i8_2(ptr %x, ptr %y) { ; CHECK-LABEL: extract_v2i8_v4i8_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vse8.v v8, (a1) +; CHECK-NEXT: lh a0, 2(a0) +; CHECK-NEXT: sh a0, 0(a1) ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %c = call <2 x i8> @llvm.vector.extract.v2i8.v4i8(<4 x i8> %a, i64 2) @@ -47,10 +41,8 @@ define void @extract_v2i8_v4i8_2(ptr %x, ptr %y) { define void @extract_v2i8_v8i8_0(ptr %x, ptr %y) { ; CHECK-LABEL: extract_v2i8_v8i8_0: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vse8.v v8, (a1) +; CHECK-NEXT: lh a0, 0(a0) +; CHECK-NEXT: sh a0, 0(a1) ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %c = call <2 x i8> @llvm.vector.extract.v2i8.v8i8(<8 x i8> %a, i64 0) @@ -61,12 +53,8 @@ define void @extract_v2i8_v8i8_0(ptr %x, ptr %y) { define void @extract_v2i8_v8i8_6(ptr %x, ptr %y) { ; CHECK-LABEL: extract_v2i8_v8i8_6: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 2, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 6 -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vse8.v v8, (a1) +; CHECK-NEXT: lh a0, 6(a0) +; CHECK-NEXT: sh a0, 0(a1) ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %c = call <2 x i8> @llvm.vector.extract.v2i8.v8i8(<8 x i8> %a, i64 6) @@ -75,21 +63,11 @@ define void @extract_v2i8_v8i8_6(ptr %x, ptr %y) { } define void @extract_v1i32_v8i32_4(ptr %x, ptr %y) { -; VLA-LABEL: extract_v1i32_v8i32_4: -; VLA: # %bb.0: -; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLA-NEXT: vle32.v v8, (a0) -; VLA-NEXT: vslidedown.vi v8, v8, 4 -; VLA-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; VLA-NEXT: vse32.v v8, (a1) -; VLA-NEXT: ret -; -; VLS-LABEL: extract_v1i32_v8i32_4: -; VLS: # %bb.0: -; VLS-NEXT: vl2re32.v v8, (a0) -; VLS-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; VLS-NEXT: vse32.v v9, (a1) -; VLS-NEXT: ret +; CHECK-LABEL: extract_v1i32_v8i32_4: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a0, 16(a0) +; CHECK-NEXT: sw a0, 0(a1) +; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <1 x i32> @llvm.vector.extract.v1i32.v8i32(<8 x i32> %a, i64 4) store <1 x i32> %c, ptr %y @@ -97,23 +75,11 @@ define void @extract_v1i32_v8i32_4(ptr %x, ptr %y) { } define void @extract_v1i32_v8i32_5(ptr %x, ptr %y) { -; 
VLA-LABEL: extract_v1i32_v8i32_5: -; VLA: # %bb.0: -; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLA-NEXT: vle32.v v8, (a0) -; VLA-NEXT: vslidedown.vi v8, v8, 5 -; VLA-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; VLA-NEXT: vse32.v v8, (a1) -; VLA-NEXT: ret -; -; VLS-LABEL: extract_v1i32_v8i32_5: -; VLS: # %bb.0: -; VLS-NEXT: vl2re32.v v8, (a0) -; VLS-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; VLS-NEXT: vslidedown.vi v8, v9, 1 -; VLS-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; VLS-NEXT: vse32.v v8, (a1) -; VLS-NEXT: ret +; CHECK-LABEL: extract_v1i32_v8i32_5: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a0, 20(a0) +; CHECK-NEXT: sw a0, 0(a1) +; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <1 x i32> @llvm.vector.extract.v1i32.v8i32(<8 x i32> %a, i64 5) store <1 x i32> %c, ptr %y @@ -121,20 +87,18 @@ define void @extract_v1i32_v8i32_5(ptr %x, ptr %y) { } define void @extract_v2i32_v8i32_0(ptr %x, ptr %y) { -; VLA-LABEL: extract_v2i32_v8i32_0: -; VLA: # %bb.0: -; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLA-NEXT: vle32.v v8, (a0) -; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLA-NEXT: vse32.v v8, (a1) -; VLA-NEXT: ret +; CHECK32-LABEL: extract_v2i32_v8i32_0: +; CHECK32: # %bb.0: +; CHECK32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK32-NEXT: vle32.v v8, (a0) +; CHECK32-NEXT: vse32.v v8, (a1) +; CHECK32-NEXT: ret ; -; VLS-LABEL: extract_v2i32_v8i32_0: -; VLS: # %bb.0: -; VLS-NEXT: vl2re32.v v8, (a0) -; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLS-NEXT: vse32.v v8, (a1) -; VLS-NEXT: ret +; CHECK64-LABEL: extract_v2i32_v8i32_0: +; CHECK64: # %bb.0: +; CHECK64-NEXT: ld a0, 0(a0) +; CHECK64-NEXT: sd a0, 0(a1) +; CHECK64-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 0) store <2 x i32> %c, ptr %y @@ -142,24 +106,19 @@ define void @extract_v2i32_v8i32_0(ptr %x, ptr %y) { } define void @extract_v2i32_v8i32_2(ptr %x, ptr %y) { -; VLA-LABEL: extract_v2i32_v8i32_2: -; VLA: # %bb.0: -; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLA-NEXT: vle32.v v8, (a0) -; VLA-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; VLA-NEXT: vslidedown.vi v8, v8, 2 -; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLA-NEXT: vse32.v v8, (a1) -; VLA-NEXT: ret +; CHECK32-LABEL: extract_v2i32_v8i32_2: +; CHECK32: # %bb.0: +; CHECK32-NEXT: addi a0, a0, 8 +; CHECK32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK32-NEXT: vle32.v v8, (a0) +; CHECK32-NEXT: vse32.v v8, (a1) +; CHECK32-NEXT: ret ; -; VLS-LABEL: extract_v2i32_v8i32_2: -; VLS: # %bb.0: -; VLS-NEXT: vl2re32.v v8, (a0) -; VLS-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; VLS-NEXT: vslidedown.vi v8, v8, 2 -; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLS-NEXT: vse32.v v8, (a1) -; VLS-NEXT: ret +; CHECK64-LABEL: extract_v2i32_v8i32_2: +; CHECK64: # %bb.0: +; CHECK64-NEXT: ld a0, 8(a0) +; CHECK64-NEXT: sd a0, 0(a1) +; CHECK64-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 2) store <2 x i32> %c, ptr %y @@ -167,22 +126,19 @@ define void @extract_v2i32_v8i32_2(ptr %x, ptr %y) { } define void @extract_v2i32_v8i32_4(ptr %x, ptr %y) { -; VLA-LABEL: extract_v2i32_v8i32_4: -; VLA: # %bb.0: -; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLA-NEXT: vle32.v v8, (a0) -; VLA-NEXT: vsetivli zero, 2, e32, m2, ta, ma -; VLA-NEXT: vslidedown.vi v8, v8, 4 -; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLA-NEXT: vse32.v v8, (a1) -; VLA-NEXT: ret +; CHECK32-LABEL: extract_v2i32_v8i32_4: +; CHECK32: # %bb.0: +; CHECK32-NEXT: addi a0, a0, 16 
+; CHECK32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK32-NEXT: vle32.v v8, (a0) +; CHECK32-NEXT: vse32.v v8, (a1) +; CHECK32-NEXT: ret ; -; VLS-LABEL: extract_v2i32_v8i32_4: -; VLS: # %bb.0: -; VLS-NEXT: vl2re32.v v8, (a0) -; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLS-NEXT: vse32.v v9, (a1) -; VLS-NEXT: ret +; CHECK64-LABEL: extract_v2i32_v8i32_4: +; CHECK64: # %bb.0: +; CHECK64-NEXT: ld a0, 16(a0) +; CHECK64-NEXT: sd a0, 0(a1) +; CHECK64-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 4) store <2 x i32> %c, ptr %y @@ -190,24 +146,19 @@ define void @extract_v2i32_v8i32_4(ptr %x, ptr %y) { } define void @extract_v2i32_v8i32_6(ptr %x, ptr %y) { -; VLA-LABEL: extract_v2i32_v8i32_6: -; VLA: # %bb.0: -; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLA-NEXT: vle32.v v8, (a0) -; VLA-NEXT: vsetivli zero, 2, e32, m2, ta, ma -; VLA-NEXT: vslidedown.vi v8, v8, 6 -; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLA-NEXT: vse32.v v8, (a1) -; VLA-NEXT: ret +; CHECK32-LABEL: extract_v2i32_v8i32_6: +; CHECK32: # %bb.0: +; CHECK32-NEXT: addi a0, a0, 24 +; CHECK32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK32-NEXT: vle32.v v8, (a0) +; CHECK32-NEXT: vse32.v v8, (a1) +; CHECK32-NEXT: ret ; -; VLS-LABEL: extract_v2i32_v8i32_6: -; VLS: # %bb.0: -; VLS-NEXT: vl2re32.v v8, (a0) -; VLS-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; VLS-NEXT: vslidedown.vi v8, v9, 2 -; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLS-NEXT: vse32.v v8, (a1) -; VLS-NEXT: ret +; CHECK64-LABEL: extract_v2i32_v8i32_6: +; CHECK64: # %bb.0: +; CHECK64-NEXT: ld a0, 24(a0) +; CHECK64-NEXT: sd a0, 0(a1) +; CHECK64-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 6) store <2 x i32> %c, ptr %y @@ -367,22 +318,11 @@ define void @extract_v8i32_nxv16i32_8( %x, ptr %y) { } define void @extract_v8i1_v64i1_0(ptr %x, ptr %y) { -; VLA-LABEL: extract_v8i1_v64i1_0: -; VLA: # %bb.0: -; VLA-NEXT: li a2, 64 -; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; VLA-NEXT: vlm.v v8, (a0) -; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; VLA-NEXT: vsm.v v8, (a1) -; VLA-NEXT: ret -; -; VLS-LABEL: extract_v8i1_v64i1_0: -; VLS: # %bb.0: -; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; VLS-NEXT: vlm.v v8, (a0) -; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; VLS-NEXT: vsm.v v8, (a1) -; VLS-NEXT: ret +; CHECK-LABEL: extract_v8i1_v64i1_0: +; CHECK: # %bb.0: +; CHECK-NEXT: lbu a0, 0(a0) +; CHECK-NEXT: sb a0, 0(a1) +; CHECK-NEXT: ret %a = load <64 x i1>, ptr %x %c = call <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> %a, i64 0) store <8 x i1> %c, ptr %y @@ -390,24 +330,11 @@ define void @extract_v8i1_v64i1_0(ptr %x, ptr %y) { } define void @extract_v8i1_v64i1_8(ptr %x, ptr %y) { -; VLA-LABEL: extract_v8i1_v64i1_8: -; VLA: # %bb.0: -; VLA-NEXT: li a2, 64 -; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; VLA-NEXT: vlm.v v8, (a0) -; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; VLA-NEXT: vslidedown.vi v8, v8, 1 -; VLA-NEXT: vsm.v v8, (a1) -; VLA-NEXT: ret -; -; VLS-LABEL: extract_v8i1_v64i1_8: -; VLS: # %bb.0: -; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; VLS-NEXT: vlm.v v8, (a0) -; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; VLS-NEXT: vslidedown.vi v8, v8, 1 -; VLS-NEXT: vsm.v v8, (a1) -; VLS-NEXT: ret +; CHECK-LABEL: extract_v8i1_v64i1_8: +; CHECK: # %bb.0: +; CHECK-NEXT: lbu a0, 1(a0) +; CHECK-NEXT: sb a0, 0(a1) +; CHECK-NEXT: ret %a = load <64 x i1>, ptr %x %c = call <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> 
%a, i64 8) store <8 x i1> %c, ptr %y @@ -415,24 +342,11 @@ define void @extract_v8i1_v64i1_8(ptr %x, ptr %y) { } define void @extract_v8i1_v64i1_48(ptr %x, ptr %y) { -; VLA-LABEL: extract_v8i1_v64i1_48: -; VLA: # %bb.0: -; VLA-NEXT: li a2, 64 -; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; VLA-NEXT: vlm.v v8, (a0) -; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; VLA-NEXT: vslidedown.vi v8, v8, 6 -; VLA-NEXT: vsm.v v8, (a1) -; VLA-NEXT: ret -; -; VLS-LABEL: extract_v8i1_v64i1_48: -; VLS: # %bb.0: -; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; VLS-NEXT: vlm.v v8, (a0) -; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; VLS-NEXT: vslidedown.vi v8, v8, 6 -; VLS-NEXT: vsm.v v8, (a1) -; VLS-NEXT: ret +; CHECK-LABEL: extract_v8i1_v64i1_48: +; CHECK: # %bb.0: +; CHECK-NEXT: lbu a0, 6(a0) +; CHECK-NEXT: sb a0, 0(a1) +; CHECK-NEXT: ret %a = load <64 x i1>, ptr %x %c = call <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> %a, i64 48) store <8 x i1> %c, ptr %y @@ -875,10 +789,8 @@ define <1 x i64> @extract_v1i64_v2i64_1(<2 x i64> %x) { define void @extract_v2bf16_v4bf16_0(ptr %x, ptr %y) { ; CHECK-LABEL: extract_v2bf16_v4bf16_0: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: lw a0, 0(a0) +; CHECK-NEXT: sw a0, 0(a1) ; CHECK-NEXT: ret %a = load <4 x bfloat>, ptr %x %c = call <2 x bfloat> @llvm.vector.extract.v2bf16.v4bf16(<4 x bfloat> %a, i64 0) @@ -889,12 +801,8 @@ define void @extract_v2bf16_v4bf16_0(ptr %x, ptr %y) { define void @extract_v2bf16_v4bf16_2(ptr %x, ptr %y) { ; CHECK-LABEL: extract_v2bf16_v4bf16_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: lw a0, 4(a0) +; CHECK-NEXT: sw a0, 0(a1) ; CHECK-NEXT: ret %a = load <4 x bfloat>, ptr %x %c = call <2 x bfloat> @llvm.vector.extract.v2bf16.v4bf16(<4 x bfloat> %a, i64 2) @@ -905,10 +813,8 @@ define void @extract_v2bf16_v4bf16_2(ptr %x, ptr %y) { define void @extract_v2f16_v4f16_0(ptr %x, ptr %y) { ; CHECK-LABEL: extract_v2f16_v4f16_0: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: lw a0, 0(a0) +; CHECK-NEXT: sw a0, 0(a1) ; CHECK-NEXT: ret %a = load <4 x half>, ptr %x %c = call <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half> %a, i64 0) @@ -919,12 +825,8 @@ define void @extract_v2f16_v4f16_0(ptr %x, ptr %y) { define void @extract_v2f16_v4f16_2(ptr %x, ptr %y) { ; CHECK-LABEL: extract_v2f16_v4f16_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: lw a0, 4(a0) +; CHECK-NEXT: sw a0, 0(a1) ; CHECK-NEXT: ret %a = load <4 x half>, ptr %x %c = call <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half> %a, i64 2)
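
Reviewer note on the semantics this patch relies on: SDNode::hasOneUse() counts uses of every result of a node, while SDValue(N, ResNo).hasOneUse() and SDNode::hasNUsesOfValue(NUses, Value) count uses of a single result only. A load node produces both a value (result 0) and a chain (result 1), and the chain is almost always consumed, so Load->hasOneUse() was rejecting loads whose loaded value is used exactly once — which is why shouldReduceLoadWidth now checks SDValue(Load, 0).hasOneUse(). The sketch below is a minimal standalone model of that distinction, not LLVM's actual classes; the Node/Use types and field names here are illustrative, though the counting loop mirrors the hasNUsesOfValue body moved into the header above.

// Standalone model (plain C++, hypothetical types): a node with several
// results, where each use records which result (ResNo) it consumes.
#include <cassert>
#include <cstdio>
#include <vector>

struct Use { unsigned ResNo; };

struct Node {
  unsigned NumValues;
  std::vector<Use> Uses; // all uses, across every result

  // Mirrors SDNode::hasNUsesOfValue: true iff exactly NUses uses consume
  // result `Value`, ignoring uses of the node's other results.
  bool hasNUsesOfValue(unsigned NUses, unsigned Value) const {
    assert(Value < NumValues && "Bad value!");
    for (const Use &U : Uses) {
      if (U.ResNo == Value) {
        if (NUses == 0)
          return false; // already saw NUses uses; this one is one too many
        --NUses;
      }
    }
    return NUses == 0; // found exactly the right number of uses
  }

  // Mirrors SDNode::hasOneUse: counts uses of *any* result.
  bool hasOneUse() const { return Uses.size() == 1; }
};

int main() {
  // A load-like node: result 0 is the loaded value, result 1 is the chain.
  // One use of the value plus the unavoidable use of the chain.
  Node Load{2, {{0}, {1}}};
  std::printf("hasOneUse (any result):   %d\n", Load.hasOneUse());          // 0
  std::printf("one use of result 0 only: %d\n", Load.hasNUsesOfValue(1, 0)); // 1
}

Under this model the load-like node reports hasOneUse() == false but hasNUsesOfValue(1, 0) == true; checking result 0 alone is what lets the width-reduction and subvector-extraction hooks fire in the AArch64/AMDGPU/ARM/RISC-V/X86 tests updated above.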