Skip to content

Commit b9aa155

Browse files
[TTI][X86] Fix detection of shuffles that use only the second shuffle operand
If the shuffle mask uses only indices from the second shuffle operand, the processShuffleMasks function currently misses it, which prevents correct cost estimation in this corner case. To fix this, the mask index limit needs to be raised to 2 * VF rather than just VF, and the processing adjusted correspondingly. This will also allow future improvements for two-source permutations. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: llvm#118972
1 parent 2e33ed9 commit b9aa155

File tree

5 files changed

+11
-10
lines changed

5 files changed

+11
-10
lines changed

llvm/lib/Analysis/VectorUtils.cpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -504,25 +504,26 @@ void llvm::processShuffleMasks(
504504
unsigned SzSrc = Sz / NumOfSrcRegs;
505505
for (unsigned I = 0; I < NumOfDestRegs; ++I) {
506506
auto &RegMasks = Res[I];
507-
RegMasks.assign(NumOfSrcRegs, {});
507+
RegMasks.assign(2 * NumOfSrcRegs, {});
508508
// Check that the values in dest registers are in the one src
509509
// register.
510510
for (unsigned K = 0; K < SzDest; ++K) {
511511
int Idx = I * SzDest + K;
512512
if (Idx == Sz)
513513
break;
514-
if (Mask[Idx] >= Sz || Mask[Idx] == PoisonMaskElem)
514+
if (Mask[Idx] >= 2 * Sz || Mask[Idx] == PoisonMaskElem)
515515
continue;
516-
int SrcRegIdx = Mask[Idx] / SzSrc;
516+
int MaskIdx = Mask[Idx] % Sz;
517+
int SrcRegIdx = MaskIdx / SzSrc + (Mask[Idx] >= Sz ? NumOfSrcRegs : 0);
517518
// Add a cost of PermuteTwoSrc for each new source register permute,
518519
// if we have more than one source registers.
519520
if (RegMasks[SrcRegIdx].empty())
520521
RegMasks[SrcRegIdx].assign(SzDest, PoisonMaskElem);
521-
RegMasks[SrcRegIdx][K] = Mask[Idx] % SzSrc;
522+
RegMasks[SrcRegIdx][K] = MaskIdx % SzSrc;
522523
}
523524
}
524525
// Process split mask.
525-
for (unsigned I = 0; I < NumOfUsedRegs; ++I) {
526+
for (unsigned I : seq<unsigned>(NumOfUsedRegs)) {
526527
auto &Dest = Res[I];
527528
int NumSrcRegs =
528529
count_if(Dest, [](ArrayRef<int> Mask) { return !Mask.empty(); });
@@ -567,7 +568,7 @@ void llvm::processShuffleMasks(
567568
int FirstIdx = -1;
568569
SecondIdx = -1;
569570
MutableArrayRef<int> FirstMask, SecondMask;
570-
for (unsigned I = 0; I < NumOfDestRegs; ++I) {
571+
for (unsigned I : seq<unsigned>(2 * NumOfSrcRegs)) {
571572
SmallVectorImpl<int> &RegMask = Dest[I];
572573
if (RegMask.empty())
573574
continue;

llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -483,7 +483,7 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
483483
; SSE-LABEL: 'test_upper_vXf32'
484484
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
485485
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
486-
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
486+
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
487487
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
488488
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
489489
;

llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -483,7 +483,7 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
483483
; SSE-LABEL: 'test_upper_vXf32'
484484
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
485485
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
486-
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
486+
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
487487
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
488488
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
489489
;

llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -483,7 +483,7 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
483483
; SSE-LABEL: 'test_upper_vXf32'
484484
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
485485
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
486-
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
486+
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
487487
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
488488
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
489489
;

llvm/test/Analysis/CostModel/X86/shuffle-splat.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -483,7 +483,7 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
483483
; SSE-LABEL: 'test_upper_vXf32'
484484
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
485485
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
486-
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
486+
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
487487
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
488488
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
489489
;

0 commit comments

Comments
 (0)