Skip to content

Commit 97c4cb4

Browse files
authored
[SLP][REVEC] getNumElements should not be used as VF when REVEC is enabled. (#134763)
1 parent c1e95b2 commit 97c4cb4

File tree

2 files changed

+44
-5
lines changed

2 files changed

+44
-5
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -16080,11 +16080,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
16080 16080
unsigned VF = std::max(CommonMask.size(), Mask.size());
16081 16081
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
16082 16082
if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
16083-
CommonMask[Idx] =
16084-
V->getType() != V1->getType()
16085-
? Idx + VF
16086-
: Mask[Idx] + cast<FixedVectorType>(V1->getType())
16087-
->getNumElements();
16083+
CommonMask[Idx] = V->getType() != V1->getType()
16084+
? Idx + VF
16085+
: Mask[Idx] + getVF(V1);
16088 16086
if (V->getType() != V1->getType())
16089 16087
V1 = createShuffle(V1, nullptr, Mask);
16090 16088
InVectors.front() = V;

llvm/test/Transforms/SLPVectorizer/revec.ll

Lines changed: 41 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -481,3 +481,44 @@ for.end.loopexit:
481 481
%or0 = or <4 x i16> %phi1, zeroinitializer
482 482
ret void
483 483
}
484+
485+
define i32 @test15() {
486+
; CHECK-LABEL: @test15(
487+
; CHECK-NEXT: entry:
488+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr null, i64 480
489+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 160
490+
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr [[TMP1]], align 16
491+
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[TMP1]], align 16
492+
; CHECK-NEXT: store <4 x float> [[TMP3]], ptr null, align 16
493+
; CHECK-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> zeroinitializer, i64 0)
494+
; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP4]], <4 x float> zeroinitializer, i64 4)
495+
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP5]], <4 x float> zeroinitializer, i64 8)
496+
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP6]], <4 x float> zeroinitializer, i64 12)
497+
; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> zeroinitializer, i64 8)
498+
; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP8]], <4 x float> zeroinitializer, i64 12)
499+
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
500+
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
501+
; CHECK-NEXT: [[TMP12:%.*]] = fadd <16 x float> [[TMP7]], [[TMP11]]
502+
; CHECK-NEXT: store <16 x float> [[TMP12]], ptr [[TMP0]], align 16
503+
; CHECK-NEXT: ret i32 0
504+
;
505+
entry:
506+
%0 = getelementptr i8, ptr null, i64 512
507+
%1 = getelementptr i8, ptr null, i64 528
508+
%2 = getelementptr i8, ptr null, i64 480
509+
%3 = getelementptr i8, ptr null, i64 496
510+
%4 = getelementptr i8, ptr null, i64 160
511+
%5 = load <4 x float>, ptr %4, align 16
512+
%6 = getelementptr i8, ptr null, i64 176
513+
%7 = load <4 x float>, ptr %6, align 16
514+
store <4 x float> %5, ptr null, align 16
515+
%8 = fadd <4 x float> zeroinitializer, %5
516+
%9 = fadd <4 x float> zeroinitializer, %7
517+
store <4 x float> %8, ptr %2, align 16
518+
store <4 x float> %9, ptr %3, align 16
519+
%10 = fadd <4 x float> zeroinitializer, zeroinitializer
520+
%11 = fadd <4 x float> zeroinitializer, zeroinitializer
521+
store <4 x float> %10, ptr %0, align 16
522+
store <4 x float> %11, ptr %1, align 16
523+
ret i32 0
524+
}

0 commit comments

Comments
 (0)