
Commit 201f07d

[TTI][X86] getMemoryOpCost - reduced costs when loading uniform values due to value reuse
Similar to what we do for broadcast shuffles: when legalising load costs, if the loaded value is known to be uniform, we only need to load a single (widest) vector and can reuse it across the split legalised registers. Fixes #111126.
1 parent 026fbe5 commit 201f07d
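For context (not part of the commit), the hint this patch keys on is the TTI::OperandValueInfo operand-info argument of getMemoryOpCost. Below is a minimal sketch of how a cost-model client might query the load cost with the value marked as uniform; the helper name compareUniformLoadCost, the alignment and address-space values are illustrative assumptions, and the exact signature may differ slightly between LLVM versions.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;
using TTI = TargetTransformInfo;

// Compare the modelled cost of the same vector load when the loaded value is
// unknown vs. when it is known to be a uniform (splat) constant.
static void compareUniformLoadCost(const TargetTransformInfo &TheTTI,
                                   FixedVectorType *VecTy) {
  TTI::OperandValueInfo AnyValue{TTI::OK_AnyValue, TTI::OP_None};
  TTI::OperandValueInfo Uniform{TTI::OK_UniformConstantValue, TTI::OP_None};

  InstructionCost Generic = TheTTI.getMemoryOpCost(
      Instruction::Load, VecTy, Align(16), /*AddressSpace=*/0,
      TTI::TCK_RecipThroughput, AnyValue);
  InstructionCost Splat = TheTTI.getMemoryOpCost(
      Instruction::Load, VecTy, Align(16), /*AddressSpace=*/0,
      TTI::TCK_RecipThroughput, Uniform);

  // After this patch, Splat should not exceed Generic whenever VecTy has to
  // be split across several registers, because only the first (widest) load
  // is charged and its result is reused for the remaining splits.
  (void)Generic;
  (void)Splat;
}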

File tree: 2 files changed (+28, -46 lines)


llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 17 additions & 12 deletions
@@ -5237,6 +5237,23 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
           CurrOpSizeBytes != 1)
         break; // Try smalled vector size.
 
+      // This isn't exactly right. We're using slow unaligned 32-byte accesses
+      // as a proxy for a double-pumped AVX memory interface such as on
+      // Sandybridge.
+      // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
+      // will be scalarized.
+      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
+        Cost += 2;
+      else if (CurrOpSizeBytes < 4)
+        Cost += 2;
+      else
+        Cost += 1;
+
+      // If we're loading a uniform value, then we don't need to split the load,
+      // loading just a single (widest) vector can be reused by all splits.
+      if (IsLoad && OpInfo.isUniform())
+        return Cost;
+
       bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
 
       // If we have fully processed the previous reg, we need to replenish it.
@@ -5265,18 +5282,6 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                          !IsLoad, CostKind);
       }
 
-      // This isn't exactly right. We're using slow unaligned 32-byte accesses
-      // as a proxy for a double-pumped AVX memory interface such as on
-      // Sandybridge.
-      // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
-      // will be scalarized.
-      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
-        Cost += 2;
-      else if (CurrOpSizeBytes < 4)
-        Cost += 2;
-      else
-        Cost += 1;
-
       SubVecEltsLeft -= CurrNumEltPerOp;
       NumEltRemaining -= CurrNumEltPerOp;
       Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
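As a worked illustration (numbers assumed, not taken from the commit): on an AVX2 target with fast unaligned 32-byte accesses, a load of a uniform <8 x i64> legalises to two 256-bit registers. The legalisation loop previously charged 1 per 32-byte operation, giving a cost of 2; with the early return above, only the first (widest) load is charged, giving a cost of 1, since that single <4 x i64> load can be reused for both halves. The same reasoning scales with the number of split registers.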

llvm/test/Transforms/SLPVectorizer/X86/store-constant.ll

Lines changed: 11 additions & 34 deletions
@@ -1,42 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64 | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v2 | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v3 | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v4 | FileCheck %s
 
 @arr = global [20 x i64] zeroinitializer, align 16
 
 define void @PR111126() {
-; SSE-LABEL: @PR111126(
-; SSE-NEXT:    store i64 1, ptr @arr, align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 8), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 16), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 24), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 40), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 48), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 56), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 72), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 80), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 88), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 104), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 112), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 120), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 136), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 144), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 152), align 8
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @PR111126(
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr @arr, align 16
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
-; AVX-NEXT:    ret void
+; CHECK-LABEL: @PR111126(
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr @arr, align 16
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
+; CHECK-NEXT:    ret void
 ;
   store i64 1, ptr @arr, align 16
   store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 8), align 8
