
Commit e3ec905

frasercrmck authored and tstellar committed
[MemCpyOpt] Fix a variety of scalable-type crashes
This patch fixes a variety of crashes resulting from the `MemCpyOptPass` casting `TypeSize` to a constant integer, whether implicitly or explicitly.

Since `MemsetRanges` requires a constant size to work, all but one of the fixes in this patch simply involve skipping the various optimizations for scalable types as cleanly as possible.

The optimization of `byval` parameters, however, has been updated to work on scalable types in theory. In practice, this optimization is only valid when the length of the `memcpy` is known to be larger than the scalable type size, which is currently never the case. This could perhaps be done in the future using the `vscale_range` attribute.

Some implicit casts have been left as they were, under the knowledge that they are only called on aggregate types. These should never be scalably-sized.

Reviewed By: nikic, tra

Differential Revision: https://reviews.llvm.org/D109329

(cherry-picked from commit 7fb66d4)
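For context, the sketch below is a minimal, stand-alone illustration of the `TypeSize` guard pattern the hunks below rely on: bail out while a size may still be scalable, and only then treat it as a plain byte count. It is not part of the patch; the helper names are invented for illustration, though every `TypeSize` member used (`isScalable`, `getFixedSize`, `TypeSize::getFixed`, `TypeSize::isKnownGE`) also appears in the diff itself.

// Minimal sketch of the guard pattern used by these fixes. Assumes the
// standard llvm/Support/TypeSize.h API; the helper names are illustrative
// only and not taken from the patch.
#include "llvm/Support/TypeSize.h"
#include <cstdint>

using namespace llvm;

// A MemsetRanges-style consumer needs a constant byte count, so scalable
// sizes (a multiple of the runtime quantity vscale) must be rejected first.
static bool isTrackableSize(TypeSize Size) { return !Size.isScalable(); }

// Only valid once the size is known to be fixed; getFixedSize() asserts if
// called on a scalable TypeSize.
static uint64_t asFixedBytes(TypeSize Size) { return Size.getFixedSize(); }

// Comparing a fixed memcpy length against a possibly-scalable byval size:
// TypeSize::isKnownGE only returns true when the relation holds for every
// possible vscale, so a fixed-vs-scalable comparison conservatively fails,
// which is why the byval optimization never fires on scalable types today.
static bool memcpyCoversByVal(uint64_t MemCpyLen, TypeSize ByValSize) {
  return TypeSize::isKnownGE(TypeSize::getFixed(MemCpyLen), ByValSize);
}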
1 parent 718280c commit e3ec905

File tree

3 files changed (+130, -13 lines)


llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
   bool processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI);
   bool processMemMove(MemMoveInst *M);
   bool performCallSlotOptzn(Instruction *cpyLoad, Instruction *cpyStore,
-                            Value *cpyDst, Value *cpySrc, uint64_t cpyLen,
+                            Value *cpyDst, Value *cpySrc, TypeSize cpyLen,
                             Align cpyAlign, CallInst *C);
   bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
   bool processMemSetMemCpyDependence(MemCpyInst *MemCpy, MemSetInst *MemSet);

llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp

Lines changed: 28 additions & 12 deletions
@@ -178,9 +178,9 @@ class MemsetRanges {
   }
 
   void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
-    int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
-
-    addRange(OffsetFromFirst, StoreSize, SI->getPointerOperand(),
+    TypeSize StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
+    assert(!StoreSize.isScalable() && "Can't track scalable-typed stores");
+    addRange(OffsetFromFirst, StoreSize.getFixedSize(), SI->getPointerOperand(),
              SI->getAlign().value(), SI);
   }
 
@@ -371,6 +371,11 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
                                                  Value *ByteVal) {
   const DataLayout &DL = StartInst->getModule()->getDataLayout();
 
+  // We can't track scalable types
+  if (StoreInst *SI = dyn_cast<StoreInst>(StartInst))
+    if (DL.getTypeStoreSize(SI->getOperand(0)->getType()).isScalable())
+      return nullptr;
+
   // Okay, so we now have a single store that can be splatable.  Scan to find
   // all subsequent stores of the same value to offset from the same pointer.
   // Join these together into ranges, so we can decide whether contiguous blocks
@@ -426,6 +431,10 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
       if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
         break;
 
+      // We can't track ranges involving scalable types.
+      if (DL.getTypeStoreSize(StoredVal->getType()).isScalable())
+        break;
+
       // Check to see if this stored value is of the same byte-splattable value.
       Value *StoredByte = isBytewiseValue(StoredVal, DL);
       if (isa<UndefValue>(ByteVal) && StoredByte)
@@ -859,7 +868,7 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
 /// the call write its result directly into the destination of the memcpy.
 bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
                                          Instruction *cpyStore, Value *cpyDest,
-                                         Value *cpySrc, uint64_t cpyLen,
+                                         Value *cpySrc, TypeSize cpySize,
                                          Align cpyAlign, CallInst *C) {
   // The general transformation to keep in mind is
   //
@@ -875,6 +884,10 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
   // src only holds uninitialized values at the moment of the call, meaning that
   // the memcpy can be discarded rather than moved.
 
+  // We can't optimize scalable types.
+  if (cpySize.isScalable())
+    return false;
+
   // Lifetime marks shouldn't be operated on.
   if (Function *F = C->getCalledFunction())
     if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
@@ -893,13 +906,13 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
   uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
                      srcArraySize->getZExtValue();
 
-  if (cpyLen < srcSize)
+  if (cpySize < srcSize)
     return false;
 
   // Check that accessing the first srcSize bytes of dest will not cause a
   // trap.  Otherwise the transform is invalid since it might cause a trap
   // to occur earlier than it otherwise would.
-  if (!isDereferenceableAndAlignedPointer(cpyDest, Align(1), APInt(64, cpyLen),
+  if (!isDereferenceableAndAlignedPointer(cpyDest, Align(1), APInt(64, cpySize),
                                           DL, C, DT))
     return false;
 
@@ -1452,9 +1465,10 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
     //       of conservatively taking the minimum?
     Align Alignment = std::min(M->getDestAlign().valueOrOne(),
                                M->getSourceAlign().valueOrOne());
-    if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
-                             CopySize->getZExtValue(), Alignment,
-                             C)) {
+    if (performCallSlotOptzn(
+            M, M, M->getDest(), M->getSource(),
+            TypeSize::getFixed(CopySize->getZExtValue()), Alignment,
+            C)) {
       LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n"
                         << "    call: " << *C << "\n"
                         << "    memcpy: " << *M << "\n");
@@ -1509,7 +1523,8 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
       Align Alignment = std::min(M->getDestAlign().valueOrOne(),
                                  M->getSourceAlign().valueOrOne());
       if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
-                               CopySize->getZExtValue(), Alignment, C)) {
+                               TypeSize::getFixed(CopySize->getZExtValue()),
+                               Alignment, C)) {
         eraseInstruction(M);
         ++NumMemCpyInstr;
         return true;
@@ -1584,7 +1599,7 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
   // Find out what feeds this byval argument.
   Value *ByValArg = CB.getArgOperand(ArgNo);
   Type *ByValTy = CB.getParamByValType(ArgNo);
-  uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
+  TypeSize ByValSize = DL.getTypeAllocSize(ByValTy);
   MemoryLocation Loc(ByValArg, LocationSize::precise(ByValSize));
   MemCpyInst *MDep = nullptr;
   if (EnableMemorySSA) {
@@ -1612,7 +1627,8 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
 
   // The length of the memcpy must be larger or equal to the size of the byval.
   ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
-  if (!C1 || C1->getValue().getZExtValue() < ByValSize)
+  if (!C1 || !TypeSize::isKnownGE(
+                 TypeSize::getFixed(C1->getValue().getZExtValue()), ByValSize))
     return false;
 
   // Get the alignment of the byval.  If the call doesn't specify the alignment,
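The byval hunk above only allows the forwarding when `TypeSize::isKnownGE` can prove the fixed `memcpy` length covers the possibly-scalable `byval` size, which it currently never can for scalable types. The commit message names `vscale_range` as a future avenue; the sketch below is purely hypothetical, assuming a known maximum vscale were plumbed through (the helper and its `MaxVScale` parameter are invented for illustration, not an existing LLVM API).

// Hypothetical only: with the maximum vscale from a vscale_range attribute,
// a scalable size could be bounded from above and compared against a fixed
// memcpy length. getKnownMinSize() is the size at vscale == 1.
static bool memcpyCoversByValWithMaxVScale(uint64_t MemCpyLen,
                                           TypeSize ByValSize,
                                           unsigned MaxVScale) {
  if (!ByValSize.isScalable())
    return MemCpyLen >= ByValSize.getFixedSize();
  // MaxVScale == 0 means the attribute gave no usable upper bound.
  return MaxVScale != 0 &&
         MemCpyLen >= ByValSize.getKnownMinSize() * uint64_t(MaxVScale);
}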
Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -memcpyopt -S -verify-memoryssa | FileCheck %s
+
+; Check that a call featuring a scalable-vector byval argument fed by a memcpy
+; doesn't crash the compiler. It previously assumed the byval type's size could
+; be represented as a known constant amount.
+define void @byval_caller(i8 *%P) {
+; CHECK-LABEL: @byval_caller(
+; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A]], i8* align 4 [[P:%.*]], i64 8, i1 false)
+; CHECK-NEXT:    [[VA:%.*]] = bitcast i8* [[A]] to <vscale x 1 x i8>*
+; CHECK-NEXT:    call void @byval_callee(<vscale x 1 x i8>* byval(<vscale x 1 x i8>) align 1 [[VA]])
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i8
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %a, i8* align 4 %P, i64 8, i1 false)
+  %va = bitcast i8* %a to <vscale x 1 x i8>*
+  call void @byval_callee(<vscale x 1 x i8>* align 1 byval(<vscale x 1 x i8>) %va)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4, i8* align 4, i64, i1)
+declare void @byval_callee(<vscale x 1 x i8>* align 1 byval(<vscale x 1 x i8>))
+
+; Check that two scalable-vector stores (overlapping, with a constant offset)
+; do not crash the compiler when checked whether or not they can be merged into
+; a single memset. There was previously an assumption that the stored values'
+; sizes could be represented by a known constant amount.
+define void @merge_stores_both_scalable(<vscale x 1 x i8>* %ptr) {
+; CHECK-LABEL: @merge_stores_both_scalable(
+; CHECK-NEXT:    store <vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8>* [[PTR:%.*]], align 1
+; CHECK-NEXT:    [[PTRI8:%.*]] = bitcast <vscale x 1 x i8>* [[PTR]] to i8*
+; CHECK-NEXT:    [[PTR_NEXT:%.*]] = getelementptr i8, i8* [[PTRI8]], i64 1
+; CHECK-NEXT:    [[PTR_NEXT_2:%.*]] = bitcast i8* [[PTR_NEXT]] to <vscale x 1 x i8>*
+; CHECK-NEXT:    store <vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8>* [[PTR_NEXT_2]], align 1
+; CHECK-NEXT:    ret void
+;
+  store <vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8>* %ptr
+  %ptri8 = bitcast <vscale x 1 x i8>* %ptr to i8*
+  %ptr.next = getelementptr i8, i8* %ptri8, i64 1
+  %ptr.next.2 = bitcast i8* %ptr.next to <vscale x 1 x i8>*
+  store <vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8>* %ptr.next.2
+  ret void
+}
+
+; As above, but where the base is scalable but the subsequent store(s) are not.
+define void @merge_stores_first_scalable(<vscale x 1 x i8>* %ptr) {
+; CHECK-LABEL: @merge_stores_first_scalable(
+; CHECK-NEXT:    store <vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8>* [[PTR:%.*]], align 1
+; CHECK-NEXT:    [[PTRI8:%.*]] = bitcast <vscale x 1 x i8>* [[PTR]] to i8*
+; CHECK-NEXT:    [[PTR_NEXT:%.*]] = getelementptr i8, i8* [[PTRI8]], i64 1
+; CHECK-NEXT:    store i8 0, i8* [[PTR_NEXT]], align 1
+; CHECK-NEXT:    ret void
+;
+  store <vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8>* %ptr
+  %ptri8 = bitcast <vscale x 1 x i8>* %ptr to i8*
+  %ptr.next = getelementptr i8, i8* %ptri8, i64 1
+  store i8 zeroinitializer, i8* %ptr.next
+  ret void
+}
+
+; As above, but where the base is not scalable but the subsequent store(s) are.
+define void @merge_stores_second_scalable(i8* %ptr) {
+; CHECK-LABEL: @merge_stores_second_scalable(
+; CHECK-NEXT:    store i8 0, i8* [[PTR:%.*]], align 1
+; CHECK-NEXT:    [[PTR_NEXT:%.*]] = getelementptr i8, i8* [[PTR]], i64 1
+; CHECK-NEXT:    [[PTR_NEXT_2:%.*]] = bitcast i8* [[PTR_NEXT]] to <vscale x 1 x i8>*
+; CHECK-NEXT:    store <vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8>* [[PTR_NEXT_2]], align 1
+; CHECK-NEXT:    ret void
+;
+  store i8 zeroinitializer, i8* %ptr
+  %ptr.next = getelementptr i8, i8* %ptr, i64 1
+  %ptr.next.2 = bitcast i8* %ptr.next to <vscale x 1 x i8>*
+  store <vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8>* %ptr.next.2
+  ret void
+}
+
+; Check that the call-slot optimization doesn't crash when encountering scalable types.
+define void @callslotoptzn(<vscale x 4 x float> %val, <vscale x 4 x float>* %out) {
+; CHECK-LABEL: @callslotoptzn(
+; CHECK-NEXT:    [[ALLOC:%.*]] = alloca <vscale x 4 x float>, align 16
+; CHECK-NEXT:    [[IDX:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT:    [[BALLOC:%.*]] = getelementptr inbounds <vscale x 4 x float>, <vscale x 4 x float>* [[ALLOC]], i64 0, i64 0
+; CHECK-NEXT:    [[STRIDE:%.*]] = getelementptr inbounds float, float* [[BALLOC]], <vscale x 4 x i32> [[IDX]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> [[VAL:%.*]], <vscale x 4 x float*> [[STRIDE]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    [[LI:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[ALLOC]], align 4
+; CHECK-NEXT:    store <vscale x 4 x float> [[LI]], <vscale x 4 x float>* [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %alloc = alloca <vscale x 4 x float>, align 16
+  %idx = tail call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+  %balloc = getelementptr inbounds <vscale x 4 x float>, <vscale x 4 x float>* %alloc, i64 0, i64 0
+  %stride = getelementptr inbounds float, float* %balloc, <vscale x 4 x i32> %idx
+  call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> %val, <vscale x 4 x float*> %stride, i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+  %li = load <vscale x 4 x float>, <vscale x 4 x float>* %alloc, align 4
+  store <vscale x 4 x float> %li, <vscale x 4 x float>* %out, align 4
+  ret void
+}
+
+declare <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+declare void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> , <vscale x 4 x float*> , i32, <vscale x 4 x i1>)
