diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 521829675ae7c..6c8e9969784c9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -23055,18 +23055,29 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt --> // trunc i64 X to i32 SDValue X = BCSrc.getOperand(0); - assert(X.getValueType().isScalarInteger() && ScalarVT.isScalarInteger() && + EVT XVT = X.getValueType(); + assert(XVT.isScalarInteger() && ScalarVT.isScalarInteger() && "Extract element and scalar to vector can't change element type " "from FP to integer."); unsigned XBitWidth = X.getValueSizeInBits(); - BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1; + unsigned Scale = XBitWidth / VecEltBitWidth; + BCTruncElt = IsLE ? 0 : Scale - 1; // An extract element return value type can be wider than its vector // operand element type. In that case, the high bits are undefined, so // it's possible that we may need to extend rather than truncate. - if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth) { + if (ExtractIndex < Scale && XBitWidth > VecEltBitWidth) { assert(XBitWidth % VecEltBitWidth == 0 && "Scalar bitwidth must be a multiple of vector element bitwidth"); + + if (ExtractIndex != BCTruncElt) { + unsigned ShiftIndex = + IsLE ? ExtractIndex : (Scale - 1) - ExtractIndex; + X = DAG.getNode( + ISD::SRL, DL, XVT, X, + DAG.getShiftAmountConstant(ShiftIndex * VecEltBitWidth, XVT, DL)); + } + return DAG.getAnyExtOrTrunc(X, DL, ScalarVT); } } diff --git a/llvm/test/CodeGen/AArch64/extract-insert.ll b/llvm/test/CodeGen/AArch64/extract-insert.ll index 077e5f3d042df..8c133d76ce317 100644 --- a/llvm/test/CodeGen/AArch64/extract-insert.ll +++ b/llvm/test/CodeGen/AArch64/extract-insert.ll @@ -5,9 +5,8 @@ define i32 @trunc_i64_to_i32_le(i64 %x) { ; BE-LABEL: trunc_i64_to_i32_le: ; BE: // %bb.0: -; BE-NEXT: fmov d0, x0 -; BE-NEXT: rev64 v0.4s, v0.4s -; BE-NEXT: fmov w0, s0 +; BE-NEXT: lsr x0, x0, #32 +; BE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; BE-NEXT: ret ; ; LE-LABEL: trunc_i64_to_i32_le: @@ -28,8 +27,8 @@ define i32 @trunc_i64_to_i32_be(i64 %x) { ; ; LE-LABEL: trunc_i64_to_i32_be: ; LE: // %bb.0: -; LE-NEXT: fmov d0, x0 -; LE-NEXT: mov w0, v0.s[1] +; LE-NEXT: lsr x0, x0, #32 +; LE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; LE-NEXT: ret %ins = insertelement <2 x i64> undef, i64 %x, i32 0 %bc = bitcast <2 x i64> %ins to <4 x i32> @@ -40,9 +39,8 @@ define i32 @trunc_i64_to_i32_be(i64 %x) { define i16 @trunc_i64_to_i16_le(i64 %x) { ; BE-LABEL: trunc_i64_to_i16_le: ; BE: // %bb.0: -; BE-NEXT: fmov d0, x0 -; BE-NEXT: rev64 v0.8h, v0.8h -; BE-NEXT: umov w0, v0.h[0] +; BE-NEXT: lsr x0, x0, #48 +; BE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; BE-NEXT: ret ; ; LE-LABEL: trunc_i64_to_i16_le: @@ -63,8 +61,8 @@ define i16 @trunc_i64_to_i16_be(i64 %x) { ; ; LE-LABEL: trunc_i64_to_i16_be: ; LE: // %bb.0: -; LE-NEXT: fmov d0, x0 -; LE-NEXT: umov w0, v0.h[3] +; LE-NEXT: lsr x0, x0, #48 +; LE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; LE-NEXT: ret %ins = insertelement <2 x i64> undef, i64 %x, i32 0 %bc = bitcast <2 x i64> %ins to <8 x i16> @@ -75,9 +73,7 @@ define i16 @trunc_i64_to_i16_be(i64 %x) { define i8 @trunc_i32_to_i8_le(i32 %x) { ; BE-LABEL: trunc_i32_to_i8_le: ; BE: // %bb.0: -; BE-NEXT: fmov s0, w0 -; BE-NEXT: rev32 v0.16b, v0.16b -; BE-NEXT: umov w0, v0.b[0] +; BE-NEXT: lsr w0, w0, #24 ; BE-NEXT: ret ; ; LE-LABEL: trunc_i32_to_i8_le: @@ -96,8 +92,7 @@ define i8 @trunc_i32_to_i8_be(i32 %x) { ; ; LE-LABEL: trunc_i32_to_i8_be: ; LE: // %bb.0: -; LE-NEXT: fmov s0, w0 -; LE-NEXT: umov w0, v0.b[3] +; LE-NEXT: lsr w0, w0, #24 ; LE-NEXT: ret %ins = insertelement <4 x i32> undef, i32 %x, i32 0 %bc = bitcast <4 x i32> %ins to <16 x i8> @@ -115,8 +110,8 @@ define i8 @trunc_i64_to_i8_be(i64 %x) { ; ; LE-LABEL: trunc_i64_to_i8_be: ; LE: // %bb.0: -; LE-NEXT: fmov d0, x0 -; LE-NEXT: umov w0, v0.b[7] +; LE-NEXT: lsr x0, x0, #56 +; LE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; LE-NEXT: ret %ins = insertelement <3 x i64> undef, i64 %x, i32 0 %bc = bitcast <3 x i64> %ins to <24 x i8> diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 66c884e95fa47..b52cbfe08156b 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -358,11 +358,10 @@ define void @store_trunc_from_64bits(ptr %src, ptr %dst) { ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: add x9, x0, #4 ; CHECK-NEXT: ld1r.4h { v0 }, [x9] -; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: lsr w9, w8, #16 ; CHECK-NEXT: strb w8, [x1] -; CHECK-NEXT: add x8, x1, #1 -; CHECK-NEXT: st1.b { v1 }[2], [x8] ; CHECK-NEXT: add x8, x1, #2 +; CHECK-NEXT: strb w9, [x1, #1] ; CHECK-NEXT: st1.b { v0 }[4], [x8] ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll index c1059be946a5f..b6799c8a88e0c 100644 --- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll +++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll @@ -16,10 +16,8 @@ define i8 @scalar_to_vector_half(ptr nocapture readonly %ad) { ; ; P9BE-LABEL: scalar_to_vector_half: ; P9BE: # %bb.0: # %entry -; P9BE-NEXT: lxsihzx v2, 0, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vsplth v2, v2, 3 -; P9BE-NEXT: vextublx r3, r3, v2 +; P9BE-NEXT: lhz r3, 0(r3) +; P9BE-NEXT: srwi r3, r3, 24 ; P9BE-NEXT: blr ; ; P8LE-LABEL: scalar_to_vector_half: @@ -30,10 +28,7 @@ define i8 @scalar_to_vector_half(ptr nocapture readonly %ad) { ; P8BE-LABEL: scalar_to_vector_half: ; P8BE: # %bb.0: # %entry ; P8BE-NEXT: lhz r3, 0(r3) -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtfprd f0, r3 -; P8BE-NEXT: mffprd r3, f0 -; P8BE-NEXT: rldicl r3, r3, 8, 56 +; P8BE-NEXT: srwi r3, r3, 24 ; P8BE-NEXT: blr entry: %0 = load <2 x i8>, ptr %ad, align 1 diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll index bec6cce889142..dba63582ff08b 100644 --- a/llvm/test/CodeGen/X86/load-partial.ll +++ b/llvm/test/CodeGen/X86/load-partial.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX ; ; Partial Vector Loads - PR16739 @@ -382,38 +382,24 @@ define dso_local i32 @load_partial_illegal_type() { define dso_local void @PR43227(ptr %explicit_0, ptr %explicit_1) { ; SSE-LABEL: PR43227: ; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: psrlq $32, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm1, 672(%rsi) -; SSE-NEXT: movdqa %xmm0, 688(%rsi) +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movaps %xmm0, 672(%rsi) +; SSE-NEXT: movaps %xmm1, 688(%rsi) ; SSE-NEXT: retq ; -; AVX1-LABEL: PR43227: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, 672(%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: PR43227: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 672(%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: PR43227: +; AVX: # %bb.0: +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: vmovaps %ymm0, 672(%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %1 = getelementptr i32, ptr %explicit_0, i64 63 %2 = load <3 x i32>, ptr %1, align 1 %3 = shufflevector <3 x i32> %2, <3 x i32> undef, <2 x i32>