Skip to content

Commit 6614571

Browse files
Pierre-vhtru
authored and committed
[AMDGPU] Fix PromoteAlloca Subvector Stores for Single Elements
The previous condition was incorrect in some cases, like storing <2 x i32> into a double. If IndexVal was > 0, we ended up never storing anything.

Reviewed By: #amdgpu, arsenm

Differential Revision: https://reviews.llvm.org/D156308

(cherry picked from commit a8aabba)
1 parent b56a38f commit 6614571

File tree

2 files changed

+54
-27
lines changed

2 files changed

+54
-27
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -386,6 +386,8 @@ static Value *promoteAllocaUserToVector(
386386
};
387387

388388
Type *VecEltTy = VectorTy->getElementType();
389+
const unsigned NumVecElts = VectorTy->getNumElements();
390+
389391
switch (Inst->getOpcode()) {
390392
case Instruction::Load: {
391393
// Loads can only be lowered if the value is known.
@@ -413,13 +415,13 @@ static Value *promoteAllocaUserToVector(
413415
// Loading a subvector.
414416
if (isa<FixedVectorType>(AccessTy)) {
415417
assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
416-
const unsigned NumElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
417-
auto *SubVecTy = FixedVectorType::get(VecEltTy, NumElts);
418+
const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
419+
auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
418420
assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
419421

420422
unsigned IndexVal = cast<ConstantInt>(Index)->getZExtValue();
421423
Value *SubVec = PoisonValue::get(SubVecTy);
422-
for (unsigned K = 0; K < NumElts; ++K) {
424+
for (unsigned K = 0; K < NumLoadedElts; ++K) {
423425
SubVec = Builder.CreateInsertElement(
424426
SubVec, Builder.CreateExtractElement(CurVal, IndexVal + K), K);
425427
}
@@ -465,8 +467,9 @@ static Value *promoteAllocaUserToVector(
465467
// Storing a subvector.
466468
if (isa<FixedVectorType>(AccessTy)) {
467469
assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
468-
const unsigned NumElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
469-
auto *SubVecTy = FixedVectorType::get(VecEltTy, NumElts);
470+
const unsigned NumWrittenElts =
471+
AccessSize / DL.getTypeStoreSize(VecEltTy);
472+
auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
470473
assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
471474

472475
if (SubVecTy->isPtrOrPtrVectorTy())
@@ -478,7 +481,8 @@ static Value *promoteAllocaUserToVector(
478481

479482
unsigned IndexVal = cast<ConstantInt>(Index)->getZExtValue();
480483
Value *CurVec = GetOrLoadCurrentVectorValue();
481-
for (unsigned K = 0; (IndexVal + K) < NumElts; ++K) {
484+
for (unsigned K = 0; K < NumWrittenElts && ((IndexVal + K) < NumVecElts);
485+
++K) {
482486
CurVec = Builder.CreateInsertElement(
483487
CurVec, Builder.CreateExtractElement(Val, K), IndexVal + K);
484488
}

llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll

Lines changed: 44 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -13,13 +13,21 @@ define void @test_trivial_subvector(<2 x i64> %val.0, <2 x i64> %val.1) {
1313
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[TMP2]], i64 1
1414
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[VAL_1]], i64 0
1515
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[TMP4]], i64 1
16-
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
17-
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[TMP4]], i64 1
18-
; CHECK-NEXT: [[DUMMYUSER:%.*]] = freeze <2 x i64> [[TMP7]]
19-
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0
20-
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> [[TMP8]], i64 undef, i64 1
21-
; CHECK-NEXT: [[DUMMYUSER_1:%.*]] = freeze <2 x i64> [[TMP9]]
22-
; CHECK-NEXT: [[DUMMYUSER_2:%.*]] = freeze <2 x i64> undef
16+
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[VAL_1]], i64 1
17+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[TMP6]], i64 2
18+
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[VAL_1]], i64 0
19+
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[TMP8]], i64 2
20+
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[VAL_1]], i64 1
21+
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> [[TMP9]], i64 [[TMP10]], i64 3
22+
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
23+
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP4]], i64 1
24+
; CHECK-NEXT: [[DUMMYUSER:%.*]] = freeze <2 x i64> [[TMP13]]
25+
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0
26+
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP8]], i64 1
27+
; CHECK-NEXT: [[DUMMYUSER_1:%.*]] = freeze <2 x i64> [[TMP15]]
28+
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i64 0
29+
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> [[TMP16]], i64 [[TMP10]], i64 1
30+
; CHECK-NEXT: [[DUMMYUSER_2:%.*]] = freeze <2 x i64> [[TMP17]]
2331
; CHECK-NEXT: ret void
2432
;
2533
entry:
@@ -58,17 +66,30 @@ define void @test_different_type_subvector(<4 x i32> %val.0, <8 x i16> %val.1, <
5866
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i16> [[VAL_1]] to <2 x i64>
5967
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP8]], i64 0
6068
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP4]], i64 [[TMP9]], i64 1
61-
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i64 0
62-
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 undef, i64 1
63-
; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[TMP12]] to <8 x i16>
64-
; CHECK-NEXT: [[DUMMYUSE_1:%.*]] = freeze <8 x i16> [[TMP13]]
65-
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i8> [[VAL_2]] to <2 x i64>
66-
; CHECK-NEXT: [[DUMMYUSE_2:%.*]] = freeze <4 x i32> undef
67-
; CHECK-NEXT: [[TMP15:%.*]] = bitcast <128 x i1> [[VAL_3]] to <2 x i64>
68-
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i64 0
69-
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> [[TMP16]], i64 undef, i64 1
70-
; CHECK-NEXT: [[TMP18:%.*]] = bitcast <2 x i64> [[TMP17]] to <128 x i1>
71-
; CHECK-NEXT: [[DUMMYUSE_I1:%.*]] = freeze <128 x i1> [[TMP18]]
69+
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP8]], i64 1
70+
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i64 2
71+
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i64 0
72+
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[TMP11]], i64 1
73+
; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP14]] to <8 x i16>
74+
; CHECK-NEXT: [[DUMMYUSE_1:%.*]] = freeze <8 x i16> [[TMP15]]
75+
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i8> [[VAL_2]] to <2 x i64>
76+
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP16]], i64 0
77+
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[TMP17]], i64 2
78+
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[TMP16]], i64 1
79+
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP19]], i64 3
80+
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> poison, i64 [[TMP17]], i64 0
81+
; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i64> [[TMP21]], i64 [[TMP19]], i64 1
82+
; CHECK-NEXT: [[TMP23:%.*]] = bitcast <2 x i64> [[TMP22]] to <4 x i32>
83+
; CHECK-NEXT: [[DUMMYUSE_2:%.*]] = freeze <4 x i32> [[TMP23]]
84+
; CHECK-NEXT: [[TMP24:%.*]] = bitcast <128 x i1> [[VAL_3]] to <2 x i64>
85+
; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP24]], i64 0
86+
; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP25]], i64 2
87+
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i64> [[TMP24]], i64 1
88+
; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x i64> [[TMP26]], i64 [[TMP27]], i64 3
89+
; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i64 0
90+
; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP29]], i64 [[TMP25]], i64 1
91+
; CHECK-NEXT: [[TMP31:%.*]] = bitcast <2 x i64> [[TMP30]] to <128 x i1>
92+
; CHECK-NEXT: [[DUMMYUSE_I1:%.*]] = freeze <128 x i1> [[TMP31]]
7293
; CHECK-NEXT: ret void
7394
;
7495
entry:
@@ -310,6 +331,8 @@ define void @test_out_of_bounds_subvec(<2 x i64> %val) {
310331
; CHECK-LABEL: define void @test_out_of_bounds_subvec
311332
; CHECK-SAME: (<2 x i64> [[VAL:%.*]]) {
312333
; CHECK-NEXT: entry:
334+
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[VAL]], i64 0
335+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP0]], i64 3
313336
; CHECK-NEXT: ret void
314337
;
315338
entry:
@@ -351,9 +374,9 @@ define void @store_2xi32_into_double(double %foo) {
351374
; CHECK-LABEL: define void @store_2xi32_into_double
352375
; CHECK-SAME: (double [[FOO:%.*]]) {
353376
; CHECK-NEXT: [[DUMMYUSER0:%.*]] = freeze double 0x5F0000005E
354-
; CHECK-NEXT: [[DUMMYUSER1:%.*]] = freeze double undef
355-
; CHECK-NEXT: [[DUMMYUSER2:%.*]] = freeze double undef
356-
; CHECK-NEXT: [[DUMMYUSER3:%.*]] = freeze double undef
377+
; CHECK-NEXT: [[DUMMYUSER1:%.*]] = freeze double 0x6700000066
378+
; CHECK-NEXT: [[DUMMYUSER2:%.*]] = freeze double 0x6900000068
379+
; CHECK-NEXT: [[DUMMYUSER3:%.*]] = freeze double 0x6F0000006E
357380
; CHECK-NEXT: ret void
358381
;
359382
%alloca = alloca [9 x double], align 8, addrspace(5)

0 commit comments

Comments
 (0)