Skip to content

Commit 94ef465

Browse files
committed
LAA: scale strides using type-size
Change getDependenceDistanceStrideAndSize to scale strides by the byte size of the accessed type, so that the returned CommonStride and MaxStride are expressed in bytes. This lets us correctly detect a common stride in cases where we previously failed to because the type sizes differed.
1 parent 3a4376b commit 94ef465

File tree

2 files changed

+32
-30
lines changed

2 files changed

+32
-30
lines changed

llvm/include/llvm/Analysis/LoopAccessAnalysis.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -370,7 +370,7 @@ class MemoryDepChecker {
370370
/// Strides could either be scaled (in bytes, taking the size of the
371371
/// underlying type into account), or unscaled (in indexing units; unscaled
372372
/// stride = scaled stride / size of underlying type). Here, strides are
373-
/// unscaled.
373+
/// scaled.
374374
uint64_t MaxStride;
375375
std::optional<uint64_t> CommonStride;
376376

llvm/lib/Analysis/LoopAccessAnalysis.cpp

Lines changed: 31 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1789,8 +1789,7 @@ void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
17891789
/// }
17901790
static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
17911791
const SCEV &MaxBTC, const SCEV &Dist,
1792-
uint64_t MaxStride,
1793-
uint64_t TypeByteSize) {
1792+
uint64_t MaxStride) {
17941793

17951794
// If we can prove that
17961795
// (**) |Dist| > MaxBTC * Step
@@ -1809,8 +1808,7 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
18091808
// will be executed only if LoopCount >= VF, proving distance >= LoopCount
18101809
// also guarantees that distance >= VF.
18111810
//
1812-
const uint64_t ByteStride = MaxStride * TypeByteSize;
1813-
const SCEV *Step = SE.getConstant(MaxBTC.getType(), ByteStride);
1811+
const SCEV *Step = SE.getConstant(MaxBTC.getType(), MaxStride);
18141812
const SCEV *Product = SE.getMulExpr(&MaxBTC, Step);
18151813

18161814
const SCEV *CastedDist = &Dist;
@@ -1854,25 +1852,23 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
18541852
if (Distance % TypeByteSize)
18551853
return false;
18561854

1857-
uint64_t ScaledDist = Distance / TypeByteSize;
1858-
1859-
// No dependence if the scaled distance is not multiple of the stride.
1855+
// No dependence if the distance is not a multiple of the stride.
18601856
// E.g.
18611857
// for (i = 0; i < 1024 ; i += 4)
18621858
// A[i+2] = A[i] + 1;
18631859
//
1864-
// Two accesses in memory (scaled distance is 2, stride is 4):
1860+
// Two accesses in memory (distance is 2, stride is 4):
18651861
// | A[0] | | | | A[4] | | | |
18661862
// | | | A[2] | | | | A[6] | |
18671863
//
18681864
// E.g.
18691865
// for (i = 0; i < 1024 ; i += 3)
18701866
// A[i+4] = A[i] + 1;
18711867
//
1872-
// Two accesses in memory (scaled distance is 4, stride is 3):
1868+
// Two accesses in memory (distance is 4, stride is 3):
18731869
// | A[0] | | | A[3] | | | A[6] | | |
18741870
// | | | | | A[4] | | | A[7] | |
1875-
return ScaledDist % Stride;
1871+
return Distance % Stride;
18761872
}
18771873

18781874
std::variant<MemoryDepChecker::Dependence::DepType,
@@ -1981,25 +1977,32 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
19811977
return MemoryDepChecker::Dependence::Unknown;
19821978
}
19831979

1984-
uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
1985-
bool HasSameSize =
1986-
DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
1987-
if (!HasSameSize)
1988-
TypeByteSize = 0;
1980+
TypeSize AStoreSz = DL.getTypeStoreSize(ATy),
1981+
BStoreSz = DL.getTypeStoreSize(BTy);
1982+
1983+
// Fail early if either store size is scalable.
1984+
if (AStoreSz.isScalable() || BStoreSz.isScalable())
1985+
return MemoryDepChecker::Dependence::Unknown;
1986+
1987+
// If store sizes are not the same, set TypeByteSize to zero, so we can check
1988+
// it in the caller.
1989+
uint64_t ASz = alignTo(AStoreSz, DL.getABITypeAlign(ATy)),
1990+
BSz = alignTo(BStoreSz, DL.getABITypeAlign(BTy)),
1991+
TypeByteSize = AStoreSz == BStoreSz ? BSz : 0;
19891992

1990-
StrideAPtrInt = std::abs(StrideAPtrInt);
1991-
StrideBPtrInt = std::abs(StrideBPtrInt);
1993+
uint64_t StrideAScaled = std::abs(StrideAPtrInt) * ASz;
1994+
uint64_t StrideBScaled = std::abs(StrideBPtrInt) * BSz;
19921995

1993-
uint64_t MaxStride = std::max(StrideAPtrInt, StrideBPtrInt);
1996+
uint64_t MaxStride = std::max(StrideAScaled, StrideBScaled);
19941997

19951998
std::optional<uint64_t> CommonStride;
1996-
if (StrideAPtrInt == StrideBPtrInt)
1997-
CommonStride = StrideAPtrInt;
1999+
if (StrideAScaled == StrideBScaled)
2000+
CommonStride = StrideAScaled;
19982001

19992002
// TODO: Historically, we don't retry with runtime checks unless the
20002003
// (unscaled) strides are the same. Fix this once the condition for runtime
20012004
// checks in isDependent is fixed.
2002-
bool ShouldRetryWithRuntimeCheck = CommonStride.has_value();
2005+
bool ShouldRetryWithRuntimeCheck = StrideAPtrInt == StrideBPtrInt;
20032006

20042007
return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride,
20052008
ShouldRetryWithRuntimeCheck, TypeByteSize,
@@ -2039,9 +2042,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
20392042
// upper bound of the number of iterations), the accesses are independent, i.e.
20402043
// they are far enough apart that accesses won't access the same location
20412044
// across all loop iterations.
2042-
if (HasSameSize && isSafeDependenceDistance(
2043-
DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()),
2044-
*Dist, MaxStride, TypeByteSize))
2045+
if (HasSameSize &&
2046+
isSafeDependenceDistance(
2047+
DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()), *Dist, MaxStride))
20452048
return Dependence::NoDep;
20462049

20472050
const SCEVConstant *ConstDist = dyn_cast<SCEVConstant>(Dist);
@@ -2145,8 +2148,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
21452148

21462149
// It's not vectorizable if the distance is smaller than the minimum distance
21472150
// needed for a vectorized/unrolled version. Vectorizing one iteration in
2148-
// front needs TypeByteSize * Stride. Vectorizing the last iteration needs
2149-
// TypeByteSize (No need to plus the last gap distance).
2151+
// front needs CommonStride. Vectorizing the last iteration needs TypeByteSize
2152+
// (no need to add the last gap distance).
21502153
//
21512154
// E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
21522155
// foo(int *A) {
@@ -2173,8 +2176,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
21732176
// We know that Dist is positive, but it may not be constant. Use the signed
21742177
// minimum for computations below, as this ensures we compute the closest
21752178
// possible dependence distance.
2176-
uint64_t MinDistanceNeeded =
2177-
TypeByteSize * *CommonStride * (MinNumIter - 1) + TypeByteSize;
2179+
uint64_t MinDistanceNeeded = *CommonStride * (MinNumIter - 1) + TypeByteSize;
21782180
if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
21792181
if (!ConstDist) {
21802182
// For non-constant distances, we checked the lower bound of the
@@ -2230,7 +2232,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
22302232

22312233
// An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
22322234
// since there is a backwards dependency.
2233-
uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * *CommonStride);
2235+
uint64_t MaxVF = MinDepDistBytes / *CommonStride;
22342236
LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
22352237
<< " with max VF = " << MaxVF << '\n');
22362238

0 commit comments

Comments
 (0)