diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 3b424bbb53e5b..cae72c4210352 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5237,6 +5237,23 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
           CurrOpSizeBytes != 1)
         break; // Try smalled vector size.
 
+      // This isn't exactly right. We're using slow unaligned 32-byte accesses
+      // as a proxy for a double-pumped AVX memory interface such as on
+      // Sandybridge.
+      // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
+      // will be scalarized.
+      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
+        Cost += 2;
+      else if (CurrOpSizeBytes < 4)
+        Cost += 2;
+      else
+        Cost += 1;
+
+      // If we're loading a uniform value, then we don't need to split the load,
+      // loading just a single (widest) vector can be reused by all splits.
+      if (IsLoad && OpInfo.isUniform())
+        return Cost;
+
       bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
 
       // If we have fully processed the previous reg, we need to replenish it.
@@ -5265,18 +5282,6 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                          !IsLoad, CostKind);
       }
 
-      // This isn't exactly right. We're using slow unaligned 32-byte accesses
-      // as a proxy for a double-pumped AVX memory interface such as on
-      // Sandybridge.
-      // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
-      // will be scalarized.
-      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
-        Cost += 2;
-      else if (CurrOpSizeBytes < 4)
-        Cost += 2;
-      else
-        Cost += 1;
-
       SubVecEltsLeft -= CurrNumEltPerOp;
       NumEltRemaining -= CurrNumEltPerOp;
       Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/store-constant.ll b/llvm/test/Transforms/SLPVectorizer/X86/store-constant.ll
index 15c878daff26b..0b5e279dea5ba 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/store-constant.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/store-constant.ll
@@ -1,42 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64    | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64    | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v2 | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v3 | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v4 | FileCheck %s
 
 @arr = global [20 x i64] zeroinitializer, align 16
 
 define void @PR111126() {
-; SSE-LABEL: @PR111126(
-; SSE-NEXT:    store i64 1, ptr @arr, align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 8), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 16), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 24), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 40), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 48), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 56), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 72), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 80), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 88), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 104), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 112), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 120), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 136), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 144), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 152), align 8
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @PR111126(
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr @arr, align 16
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
-; AVX-NEXT:    ret void
+; CHECK-LABEL: @PR111126(
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr @arr, align 16
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
+; CHECK-NEXT:    ret void
 ;
   store i64 1, ptr @arr, align 16
   store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 8), align 8
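
For readers who want to poke at the heuristic without building LLVM, here is a minimal standalone C++ sketch of the revised per-chunk cost loop. The names (memoryOpCost, Params) and the flattened byte-based loop are illustrative simplifications, not the real X86TTIImpl API; the point is only the control flow this patch introduces: the per-op charge is now accrued before any split bookkeeping, so a uniform load can return after paying for just the first (widest) op.

// Illustrative model only: names and structure are simplified from
// X86TTIImpl::getMemoryOpCost; this is not the real LLVM API.
#include <cstdio>

struct Params {
  bool UnalignedMem32Slow; // proxy for a double-pumped AVX interface (e.g. Sandybridge)
  bool IsLoad;
  bool IsUniform;          // operand is a known-uniform (splat) value
};

// Cost of covering TotalBytes with memory ops of OpBytes each.
int memoryOpCost(int TotalBytes, int OpBytes, Params P) {
  int Cost = 0;
  for (int Done = 0; Done < TotalBytes; Done += OpBytes) {
    // Per-op charge, accrued *before* the split bookkeeping.
    if (OpBytes == 32 && P.UnalignedMem32Slow)
      Cost += 2; // slow unaligned 32-byte access
    else if (OpBytes < 4)
      Cost += 2; // sub-32-bit ops need PINSR*/PEXTR* or get scalarized
    else
      Cost += 1;

    // A uniform load needs only one (widest) vector load; every split can
    // reuse it, so stop after charging the first op.
    if (P.IsLoad && P.IsUniform)
      return Cost;
  }
  return Cost;
}

int main() {
  Params P{/*UnalignedMem32Slow=*/false, /*IsLoad=*/true, /*IsUniform=*/true};
  std::printf("uniform load: %d\n", memoryOpCost(160, 16, P)); // 1 with the early exit
  P.IsUniform = false;
  std::printf("split load:   %d\n", memoryOpCost(160, 16, P)); // 10 (one per 16-byte op)
}

In this model a uniform 160-byte load covered by 16-byte ops is charged 1 instead of 10. On the test side, the observable effect of the patch is that all four -mcpu levels in store-constant.ll now expect the same vectorized <4 x i64> splat stores, which is why the separate SSE/AVX check prefixes collapse into a single CHECK prefix.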