Skip to content

Commit 02e37c6

Browse files
committed
[AMDGPU] Restrict promote alloca on pointers across address spaces
When a pointer is loaded from or stored to the stack and the pointers involved are not in the same address space, we restrict the promote alloca pass so that it does not vectorize if the pointer storage sizes are different. Example: in address space 0, the pointer size is 64 bits; in address space 5, the pointer size is 32 bits. Casting a pointer across address spaces with different pointer sizes is undefined behavior. Assertion found through fuzzing.
1 parent 770364b commit 02e37c6

File tree

3 files changed

+99
-67
lines changed

3 files changed

+99
-67
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -674,7 +674,17 @@ static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
674674
//
675675
// We could handle more complicated cases, but it'd make things a lot more
676676
// complicated.
677-
if (isa<FixedVectorType>(AccessTy)) {
677+
678+
// If both are pointer types, verify if they are compatible to copy across
679+
// address spaces.
680+
bool canCopyAcrossAddressSpaces = true;
681+
if (AccessTy->isPtrOrPtrVectorTy() && VecTy->isPtrOrPtrVectorTy()) {
682+
if (DL.getPointerSize(AccessTy->getPointerAddressSpace()) !=
683+
DL.getPointerSize(VecTy->getPointerAddressSpace()))
684+
canCopyAcrossAddressSpaces = false;
685+
}
686+
687+
if (isa<FixedVectorType>(AccessTy) && canCopyAcrossAddressSpaces) {
678688
TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
679689
TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
680690
return AccTS.isKnownMultipleOf(VecTS);
@@ -796,23 +806,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
796806
if (!IsSimple)
797807
return RejectUser(Inst, "not a simple load or store");
798808

799-
// If the access type is a pointer, reject the address spaces with
800-
// different pointer sizes.
801-
// store <2 x ptr> %arg, ptr addrspace(5) %alloca - Reject.
802-
// %tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca - ok.
803-
if (AccessTy->isPtrOrPtrVectorTy()) {
804-
if (DL->getPointerSize(getLoadStoreAddressSpace(Inst)) !=
805-
DL->getPointerSize(AccessTy->getPointerAddressSpace()))
806-
return RejectUser(Inst, "pointers to incompatible address spaces");
807-
}
808-
809809
Ptr = Ptr->stripPointerCasts();
810810

811-
// Alloca already accessed as vector.
812-
if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
813-
DL->getTypeStoreSize(AccessTy)) {
814-
WorkList.push_back(Inst);
815-
continue;
811+
// Since the size of a pointer can differ across address spaces, let the
812+
// isSupportedAccessType() handle the pointer load and store accesses.
813+
if (!AccessTy->isPtrOrPtrVectorTy() || !VectorTy->isPtrOrPtrVectorTy()) {
814+
// Alloca already accessed as vector.
815+
if (Ptr == &Alloca &&
816+
DL->getTypeStoreSize(AllocaTy) == DL->getTypeStoreSize(AccessTy)) {
817+
WorkList.push_back(Inst);
818+
continue;
819+
}
816820
}
817821

818822
if (!isSupportedAccessType(VectorTy, AccessTy, *DL))

llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll

Lines changed: 52 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,21 @@ end:
9393
ret void
9494
}
9595

96+
define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) {
97+
; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec
98+
; CHECK-SAME: (ptr [[ARG:%.*]]) {
99+
; CHECK-NEXT: entry:
100+
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[ARG]] to i64
101+
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <8 x i8>
102+
; CHECK-NEXT: ret ptr [[ARG]]
103+
;
104+
entry:
105+
%alloca = alloca [8 x i8], align 8, addrspace(5)
106+
store ptr %arg, ptr addrspace(5) %alloca, align 8
107+
%tmp = load ptr, ptr addrspace(5) %alloca, align 8
108+
ret ptr %tmp
109+
}
110+
96111
define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec(ptr addrspace(3) %arg) {
97112
; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec
98113
; CHECK-SAME: (ptr addrspace(3) [[ARG:%.*]]) {
@@ -108,6 +123,22 @@ entry:
108123
ret ptr addrspace(3) %tmp
109124
}
110125

126+
define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr> %arg) {
127+
; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec
128+
; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) {
129+
; CHECK-NEXT: entry:
130+
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr> [[ARG]] to <2 x i64>
131+
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32>
132+
; CHECK-NEXT: [[TMP2:%.*]] = inttoptr <4 x i32> [[TMP1]] to <4 x ptr addrspace(3)>
133+
; CHECK-NEXT: ret <4 x ptr addrspace(3)> [[TMP2]]
134+
;
135+
entry:
136+
%alloca = alloca [4 x i32], align 8, addrspace(5)
137+
store <2 x ptr> %arg, ptr addrspace(5) %alloca, align 8
138+
%tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8
139+
ret <4 x ptr addrspace(3)> %tmp
140+
}
141+
111142
define <8 x i16> @ptralloca_load_store_ints_full(<2 x i64> %arg) {
112143
; CHECK-LABEL: define <8 x i16> @ptralloca_load_store_ints_full
113144
; CHECK-SAME: (<2 x i64> [[ARG:%.*]]) {
@@ -168,38 +199,32 @@ entry:
168199
ret ptr addrspace(3) %tmp
169200
}
170201

171-
; Will not vectorize because we are doing a load/store of a pointer across
172-
; address spaces of varying pointer sizes.
173-
define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) {
174-
; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec
175-
; CHECK-SAME: (ptr [[ARG:%.*]]) {
202+
; Will not vectorize because we're saving a 64-bit pointer from addrspace 0
203+
; into two 32-bit pointers of addrspace 5.
204+
; CHECK-LABEL: define void @alloca_load_store_ptr_mixed_addrspace_ptrvec
176205
; CHECK-NEXT: entry:
177-
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [8 x i8], align 8, addrspace(5)
178-
; CHECK-NEXT: store ptr [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8
179-
; CHECK-NEXT: [[TMP:%.*]] = load ptr, ptr addrspace(5) [[ALLOCA]], align 8
180-
; CHECK-NEXT: ret ptr [[TMP]]
206+
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca <2 x ptr addrspace(5)>, align 8, addrspace(5)
207+
; CHECK-NEXT: store ptr undef, ptr addrspace(5) [[ALLOCA]], align 8
208+
; CHECK-NEXT: ret void
181209
;
210+
define void @alloca_load_store_ptr_mixed_addrspace_ptrvec() {
182211
entry:
183-
%alloca = alloca [8 x i8], align 8, addrspace(5)
184-
store ptr %arg, ptr addrspace(5) %alloca, align 8
185-
%tmp = load ptr, ptr addrspace(5) %alloca, align 8
186-
ret ptr %tmp
212+
%A2 = alloca <2 x ptr addrspace(5)>, align 8, addrspace(5)
213+
store ptr undef, ptr addrspace(5) %A2, align 8
214+
ret void
187215
}
188216

189-
; Will not vectorize because we are doing a load/store of a pointer across
190-
; address spaces of varying pointer sizes.
191-
define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr> %arg) {
192-
; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec
193-
; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) {
217+
; Will not vectorize because we're saving 32-bit pointers from addrspace 5
218+
; into two 64-bit pointers of addrspace 0, even though the size in memory
219+
; is the same.
220+
; CHECK-LABEL: define void @alloca_load_store_ptr_mixed_addrspace_ptrvec2
194221
; CHECK-NEXT: entry:
195-
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4 x i32], align 8, addrspace(5)
196-
; CHECK-NEXT: store <2 x ptr> [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8
197-
; CHECK-NEXT: [[TMP:%.*]] = load <4 x ptr addrspace(3)>, ptr addrspace(5) [[ALLOCA]], align 8
198-
; CHECK-NEXT: ret <4 x ptr addrspace(3)> [[TMP]]
199-
;
222+
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca <2 x ptr>, align 8
223+
; CHECK-NEXT: store <4 x ptr addrspace(5)> undef, ptr [[ALLOCA]], align 8
224+
; CHECK-NEXT: ret void
225+
define void @alloca_load_store_ptr_mixed_addrspace_ptrvec2() {
200226
entry:
201-
%alloca = alloca [4 x i32], align 8, addrspace(5)
202-
store <2 x ptr> %arg, ptr addrspace(5) %alloca, align 8
203-
%tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8
204-
ret <4 x ptr addrspace(3)> %tmp
227+
%A2 = alloca <2 x ptr>, align 8
228+
store <4 x ptr addrspace(5)> undef, ptr %A2, align 8
229+
ret void
205230
}

llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll

Lines changed: 27 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -218,35 +218,38 @@ entry:
218218
ret void
219219
}
220220

221-
define void @test_different_type_subvector_ptrs(<2 x ptr addrspace(3)> %val.0, <4 x ptr addrspace(3)> %val.1) {
221+
define void @test_different_type_subvector_ptrs(<2 x ptr addrspace(1)> %val.0, <4 x ptr addrspace(3)> %val.1) {
222222
; CHECK-LABEL: define void @test_different_type_subvector_ptrs
223-
; CHECK-SAME: (<2 x ptr addrspace(3)> [[VAL_0:%.*]], <4 x ptr addrspace(3)> [[VAL_1:%.*]]) {
223+
; CHECK-SAME: (<2 x ptr addrspace(1)> [[VAL_0:%.*]], <4 x ptr addrspace(3)> [[VAL_1:%.*]]) {
224224
; CHECK-NEXT: entry:
225-
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[VAL_0]] to <2 x i32>
226-
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
227-
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i64 0
228-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> undef, i64 [[TMP2]], i32 0
229-
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i64 0
230-
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <2 x i32>
231-
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr <2 x i32> [[TMP5]] to <2 x ptr addrspace(3)>
232-
; CHECK-NEXT: [[DUMMYUSER:%.*]] = freeze <2 x ptr addrspace(3)> [[TMP6]]
233-
; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint <4 x ptr addrspace(3)> [[VAL_1]] to <4 x i32>
234-
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <2 x i64>
235-
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP8]], i64 0
236-
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[TMP9]], i32 0
237-
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP8]], i64 1
238-
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 1
239-
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i64 0
240-
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[TMP11]], i64 1
241-
; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP14]] to <4 x i32>
242-
; CHECK-NEXT: [[TMP16:%.*]] = inttoptr <4 x i32> [[TMP15]] to <4 x ptr addrspace(3)>
243-
; CHECK-NEXT: [[DUMMYUSER_1:%.*]] = freeze <4 x ptr addrspace(3)> [[TMP16]]
225+
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(1)> [[VAL_0]] to <2 x i64>
226+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0
227+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> undef, i64 [[TMP1]], i32 0
228+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1
229+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[TMP3]], i32 1
230+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i64 0
231+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[TMP3]], i64 1
232+
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr <2 x i64> [[TMP6]] to <2 x ptr addrspace(1)>
233+
; CHECK-NEXT: [[DUMMYUSER:%.*]] = freeze <2 x ptr addrspace(1)> [[TMP7]]
234+
; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint <4 x ptr addrspace(3)> [[VAL_1]] to <4 x i32>
235+
; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <2 x i64>
236+
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i64 0
237+
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> [[TMP4]], i64 [[TMP10]], i32 0
238+
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i64 1
239+
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[TMP12]], i32 1
240+
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i64 0
241+
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP12]], i64 1
242+
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP15]] to <4 x i32>
243+
; CHECK-NEXT: [[TMP17:%.*]] = inttoptr <4 x i32> [[TMP16]] to <4 x ptr addrspace(3)>
244+
; CHECK-NEXT: [[DUMMYUSER_1:%.*]] = freeze <4 x ptr addrspace(3)> [[TMP17]]
245+
; CHECK-NEXT: ret void
246+
;
244247
entry:
245248
%stack = alloca [4 x i64], align 4, addrspace(5)
246249

247-
store <2 x ptr addrspace(3)> %val.0, ptr addrspace(5) %stack
248-
%reload = load <2 x ptr addrspace(3)>, ptr addrspace(5) %stack
249-
%dummyuser = freeze <2 x ptr addrspace(3)> %reload
250+
store <2 x ptr addrspace(1)> %val.0, ptr addrspace(5) %stack
251+
%reload = load <2 x ptr addrspace(1)>, ptr addrspace(5) %stack
252+
%dummyuser = freeze <2 x ptr addrspace(1)> %reload
250253

251254
store <4 x ptr addrspace(3)> %val.1, ptr addrspace(5) %stack
252255
%reload.1 = load <4 x ptr addrspace(3)>, ptr addrspace(5) %stack

0 commit comments

Comments
 (0)