Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -796,6 +796,16 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
if (!IsSimple)
return RejectUser(Inst, "not a simple load or store");

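// If the access type is a pointer (or a vector of pointers), reject the
// access when the pointer size of its address space differs from the pointer
// size of the address space being loaded from or stored to.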
// store <2 x ptr> %arg, ptr addrspace(5) %alloca - Reject.
// %tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca - ok.
if (AccessTy->isPtrOrPtrVectorTy()) {
if (DL->getPointerSize(getLoadStoreAddressSpace(Inst)) !=
DL->getPointerSize(AccessTy->getPointerAddressSpace()))
return RejectUser(Inst, "pointers to incompatible address spaces");
}

Ptr = Ptr->stripPointerCasts();

// Alloca already accessed as vector.
Expand Down
67 changes: 36 additions & 31 deletions llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
Original file line number Diff line number Diff line change
Expand Up @@ -93,21 +93,6 @@ end:
ret void
}

; Stores a ptr (default address space) into an [8 x i8] addrspace(5) alloca and
; reloads it as a ptr. The expected output contains no alloca, store or load:
; the argument is converted with ptrtoint and bitcast, and the function returns
; the argument directly.
define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) {
; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec
; CHECK-SAME: (ptr [[ARG:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <8 x i8>
; CHECK-NEXT: ret ptr [[ARG]]
;
entry:
%alloca = alloca [8 x i8], align 8, addrspace(5)
store ptr %arg, ptr addrspace(5) %alloca, align 8
%tmp = load ptr, ptr addrspace(5) %alloca, align 8
ret ptr %tmp
}

define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec(ptr addrspace(3) %arg) {
; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec
; CHECK-SAME: (ptr addrspace(3) [[ARG:%.*]]) {
Expand All @@ -123,22 +108,6 @@ entry:
ret ptr addrspace(3) %tmp
}

; Stores <2 x ptr> into a [4 x i32] addrspace(5) alloca and reloads the same
; memory as <4 x ptr addrspace(3)>. The expected output contains no alloca: the
; argument goes through ptrtoint to <2 x i64>, a bitcast to <4 x i32>, and an
; inttoptr to the returned <4 x ptr addrspace(3)>.
define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr> %arg) {
; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec
; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr> [[ARG]] to <2 x i64>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = inttoptr <4 x i32> [[TMP1]] to <4 x ptr addrspace(3)>
; CHECK-NEXT: ret <4 x ptr addrspace(3)> [[TMP2]]
;
entry:
%alloca = alloca [4 x i32], align 8, addrspace(5)
store <2 x ptr> %arg, ptr addrspace(5) %alloca, align 8
%tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8
ret <4 x ptr addrspace(3)> %tmp
}

define <8 x i16> @ptralloca_load_store_ints_full(<2 x i64> %arg) {
; CHECK-LABEL: define <8 x i16> @ptralloca_load_store_ints_full
; CHECK-SAME: (<2 x i64> [[ARG:%.*]]) {
Expand Down Expand Up @@ -198,3 +167,39 @@ entry:
%tmp = load ptr addrspace(3), ptr addrspace(5) %alloca, align 8
ret ptr addrspace(3) %tmp
}

; The alloca is not promoted to a vector here: a ptr (default address space) is
; stored and loaded through an addrspace(5) pointer, i.e. a load/store of a
; pointer across address spaces of varying pointer sizes. The expected output
; keeps the alloca, the store and the load unchanged.
define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) {
; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec
; CHECK-SAME: (ptr [[ARG:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [8 x i8], align 8, addrspace(5)
; CHECK-NEXT: store ptr [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8
; CHECK-NEXT: [[TMP:%.*]] = load ptr, ptr addrspace(5) [[ALLOCA]], align 8
; CHECK-NEXT: ret ptr [[TMP]]
;
entry:
%alloca = alloca [8 x i8], align 8, addrspace(5)
store ptr %arg, ptr addrspace(5) %alloca, align 8
%tmp = load ptr, ptr addrspace(5) %alloca, align 8
ret ptr %tmp
}

; The alloca is not promoted to a vector here: <2 x ptr> is stored and
; <4 x ptr addrspace(3)> is loaded through the same addrspace(5) alloca, i.e. a
; load/store of pointers across address spaces of varying pointer sizes. The
; expected output keeps the alloca, the store and the load unchanged.
define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr> %arg) {
; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec
; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4 x i32], align 8, addrspace(5)
; CHECK-NEXT: store <2 x ptr> [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8
; CHECK-NEXT: [[TMP:%.*]] = load <4 x ptr addrspace(3)>, ptr addrspace(5) [[ALLOCA]], align 8
; CHECK-NEXT: ret <4 x ptr addrspace(3)> [[TMP]]
;
entry:
%alloca = alloca [4 x i32], align 8, addrspace(5)
store <2 x ptr> %arg, ptr addrspace(5) %alloca, align 8
%tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8
ret <4 x ptr addrspace(3)> %tmp
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The tests should also reproduce the same situation with one of the accesses as a scalar, and some with int/fp types.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please be more specific about what you mean by "scalar"?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i32 and double

51 changes: 24 additions & 27 deletions llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -218,38 +218,35 @@ entry:
ret void
}

define void @test_different_type_subvector_ptrs(<2 x ptr addrspace(1)> %val.0, <4 x ptr addrspace(3)> %val.1) {
define void @test_different_type_subvector_ptrs(<2 x ptr addrspace(3)> %val.0, <4 x ptr addrspace(3)> %val.1) {
; CHECK-LABEL: define void @test_different_type_subvector_ptrs
; CHECK-SAME: (<2 x ptr addrspace(1)> [[VAL_0:%.*]], <4 x ptr addrspace(3)> [[VAL_1:%.*]]) {
; CHECK-SAME: (<2 x ptr addrspace(3)> [[VAL_0:%.*]], <4 x ptr addrspace(3)> [[VAL_1:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(1)> [[VAL_0]] to <2 x i64>
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> undef, i64 [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[TMP3]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i64 0
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[TMP3]], i64 1
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr <2 x i64> [[TMP6]] to <2 x ptr addrspace(1)>
; CHECK-NEXT: [[DUMMYUSER:%.*]] = freeze <2 x ptr addrspace(1)> [[TMP7]]
; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint <4 x ptr addrspace(3)> [[VAL_1]] to <4 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <2 x i64>
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i64 0
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> [[TMP4]], i64 [[TMP10]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i64 1
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[TMP12]], i32 1
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i64 0
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP12]], i64 1
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP15]] to <4 x i32>
; CHECK-NEXT: [[TMP17:%.*]] = inttoptr <4 x i32> [[TMP16]] to <4 x ptr addrspace(3)>
; CHECK-NEXT: [[DUMMYUSER_1:%.*]] = freeze <4 x ptr addrspace(3)> [[TMP17]]
; CHECK-NEXT: ret void
;
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[VAL_0]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i64 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> undef, i64 [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i64 0
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <2 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr <2 x i32> [[TMP5]] to <2 x ptr addrspace(3)>
; CHECK-NEXT: [[DUMMYUSER:%.*]] = freeze <2 x ptr addrspace(3)> [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint <4 x ptr addrspace(3)> [[VAL_1]] to <4 x i32>
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <2 x i64>
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP8]], i64 0
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[TMP9]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP8]], i64 1
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 1
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i64 0
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[TMP11]], i64 1
; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP14]] to <4 x i32>
; CHECK-NEXT: [[TMP16:%.*]] = inttoptr <4 x i32> [[TMP15]] to <4 x ptr addrspace(3)>
; CHECK-NEXT: [[DUMMYUSER_1:%.*]] = freeze <4 x ptr addrspace(3)> [[TMP16]]
entry:
%stack = alloca [4 x i64], align 4, addrspace(5)

store <2 x ptr addrspace(1)> %val.0, ptr addrspace(5) %stack
%reload = load <2 x ptr addrspace(1)>, ptr addrspace(5) %stack
%dummyuser = freeze <2 x ptr addrspace(1)> %reload
store <2 x ptr addrspace(3)> %val.0, ptr addrspace(5) %stack
%reload = load <2 x ptr addrspace(3)>, ptr addrspace(5) %stack
%dummyuser = freeze <2 x ptr addrspace(3)> %reload

store <4 x ptr addrspace(3)> %val.1, ptr addrspace(5) %stack
%reload.1 = load <4 x ptr addrspace(3)>, ptr addrspace(5) %stack
Expand Down
Loading