Skip to content

Commit ca49a47

Browse files
nemanjaizmodem
authored and committed
[PowerPC] Fix computation of offset for load-and-splat for permuted loads
Unfortunately, this is another regression from my canonicalization patch (1fed131). The patch contained two implicit assumptions: (1) that we would have a permuted load only if we are loading a partial vector, and (2) that a partial vector load would necessarily be as wide as the splat. However, assumption 2 is not correct, since it is possible to do a wider load and only splat half of it. This patch corrects that assumption by simply checking whether the load is permuted and adjusting the offset if it is. (cherry picked from commit 7d076e1)
1 parent 152c2b1 commit ca49a47

File tree

2 files changed

+106
-8
lines changed

2 files changed

+106
-8
lines changed

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9111,13 +9111,15 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
91119111
Op0.getOperand(1));
91129112
}
91139113

9114-
static const SDValue *getNormalLoadInput(const SDValue &Op) {
9114+
static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) {
91159115
const SDValue *InputLoad = &Op;
91169116
if (InputLoad->getOpcode() == ISD::BITCAST)
91179117
InputLoad = &InputLoad->getOperand(0);
91189118
if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR ||
9119-
InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED)
9119+
InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) {
9120+
IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED;
91209121
InputLoad = &InputLoad->getOperand(0);
9122+
}
91219123
if (InputLoad->getOpcode() != ISD::LOAD)
91229124
return nullptr;
91239125
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
@@ -9289,7 +9291,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
92899291

92909292
if (!BVNIsConstantSplat || SplatBitSize > 32) {
92919293

9292-
const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
9294+
bool IsPermutedLoad = false;
9295+
const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0), IsPermutedLoad);
92939296
// Handle load-and-splat patterns as we have instructions that will do this
92949297
// in one go.
92959298
if (InputLoad && DAG.isSplatValue(Op, true)) {
@@ -9912,14 +9915,25 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
99129915
// If this is a load-and-splat, we can do that with a single instruction
99139916
// in some cases. However if the load has multiple uses, we don't want to
99149917
// combine it because that will just produce multiple loads.
9915-
const SDValue *InputLoad = getNormalLoadInput(V1);
9918+
bool IsPermutedLoad = false;
9919+
const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad);
99169920
if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
99179921
(PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&
99189922
InputLoad->hasOneUse()) {
99199923
bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4);
99209924
int SplatIdx =
99219925
PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);
99229926

9927+
// The splat index for permuted loads will be in the left half of the vector
9928+
// which is strictly wider than the loaded value by 8 bytes. So we need to
9929+
// adjust the splat index to point to the correct address in memory.
9930+
if (IsPermutedLoad) {
9931+
assert(isLittleEndian && "Unexpected permuted load on big endian target");
9932+
SplatIdx += IsFourByte ? 2 : 1;
9933+
assert(SplatIdx < IsFourByte ? 4 : 2 &&
9934+
"Splat of a value outside of the loaded memory");
9935+
}
9936+
99239937
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
99249938
// For 4-byte load-and-splat, we need Power9.
99259939
if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
@@ -9929,10 +9943,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
99299943
else
99309944
Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
99319945

9932-
// If we are loading a partial vector, it does not make sense to adjust
9933-
// the base pointer. This happens with (splat (s_to_v_permuted (ld))).
9934-
if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ? 32 : 64))
9935-
Offset = 0;
99369946
SDValue BasePtr = LD->getBasePtr();
99379947
if (Offset != 0)
99389948
BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),

llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,5 +446,93 @@ entry:
446446
ret <16 x i8> %shuffle
447447
}
448448

449+
define dso_local <4 x i32> @testSplat4Low(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
450+
; CHECK-P8-LABEL: testSplat4Low:
451+
; CHECK-P8: # %bb.0: # %entry
452+
; CHECK-P8-NEXT: ld r3, 0(r3)
453+
; CHECK-P8-NEXT: mtfprd f0, r3
454+
; CHECK-P8-NEXT: xxspltw v2, vs0, 0
455+
; CHECK-P8-NEXT: blr
456+
;
457+
; CHECK-P9-LABEL: testSplat4Low:
458+
; CHECK-P9: # %bb.0: # %entry
459+
; CHECK-P9-NEXT: addi r3, r3, 4
460+
; CHECK-P9-NEXT: lxvwsx v2, 0, r3
461+
; CHECK-P9-NEXT: blr
462+
;
463+
; CHECK-NOVSX-LABEL: testSplat4Low:
464+
; CHECK-NOVSX: # %bb.0: # %entry
465+
; CHECK-NOVSX-NEXT: ld r3, 0(r3)
466+
; CHECK-NOVSX-NEXT: addi r4, r1, -16
467+
; CHECK-NOVSX-NEXT: std r3, -16(r1)
468+
; CHECK-NOVSX-NEXT: lvx v2, 0, r4
469+
; CHECK-NOVSX-NEXT: vspltw v2, v2, 2
470+
; CHECK-NOVSX-NEXT: blr
471+
entry:
472+
%0 = load <8 x i8>, <8 x i8>* %ptr, align 8
473+
%vecinit18 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
474+
%1 = bitcast <16 x i8> %vecinit18 to <4 x i32>
475+
ret <4 x i32> %1
476+
}
477+
478+
; Function Attrs: norecurse nounwind readonly
479+
define dso_local <4 x i32> @testSplat4hi(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
480+
; CHECK-P8-LABEL: testSplat4hi:
481+
; CHECK-P8: # %bb.0: # %entry
482+
; CHECK-P8-NEXT: ld r3, 0(r3)
483+
; CHECK-P8-NEXT: mtfprd f0, r3
484+
; CHECK-P8-NEXT: xxspltw v2, vs0, 1
485+
; CHECK-P8-NEXT: blr
486+
;
487+
; CHECK-P9-LABEL: testSplat4hi:
488+
; CHECK-P9: # %bb.0: # %entry
489+
; CHECK-P9-NEXT: lxvwsx v2, 0, r3
490+
; CHECK-P9-NEXT: blr
491+
;
492+
; CHECK-NOVSX-LABEL: testSplat4hi:
493+
; CHECK-NOVSX: # %bb.0: # %entry
494+
; CHECK-NOVSX-NEXT: ld r3, 0(r3)
495+
; CHECK-NOVSX-NEXT: addi r4, r1, -16
496+
; CHECK-NOVSX-NEXT: std r3, -16(r1)
497+
; CHECK-NOVSX-NEXT: lvx v2, 0, r4
498+
; CHECK-NOVSX-NEXT: vspltw v2, v2, 3
499+
; CHECK-NOVSX-NEXT: blr
500+
entry:
501+
%0 = load <8 x i8>, <8 x i8>* %ptr, align 8
502+
%vecinit22 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
503+
%1 = bitcast <16 x i8> %vecinit22 to <4 x i32>
504+
ret <4 x i32> %1
505+
}
506+
507+
; Function Attrs: norecurse nounwind readonly
508+
define dso_local <2 x i64> @testSplat8(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
509+
; CHECK-P8-LABEL: testSplat8:
510+
; CHECK-P8: # %bb.0: # %entry
511+
; CHECK-P8-NEXT: lxvdsx v2, 0, r3
512+
; CHECK-P8-NEXT: blr
513+
;
514+
; CHECK-P9-LABEL: testSplat8:
515+
; CHECK-P9: # %bb.0: # %entry
516+
; CHECK-P9-NEXT: lxvdsx v2, 0, r3
517+
; CHECK-P9-NEXT: blr
518+
;
519+
; CHECK-NOVSX-LABEL: testSplat8:
520+
; CHECK-NOVSX: # %bb.0: # %entry
521+
; CHECK-NOVSX-NEXT: ld r3, 0(r3)
522+
; CHECK-NOVSX-NEXT: addis r4, r2, .LCPI19_0@toc@ha
523+
; CHECK-NOVSX-NEXT: addi r4, r4, .LCPI19_0@toc@l
524+
; CHECK-NOVSX-NEXT: lvx v2, 0, r4
525+
; CHECK-NOVSX-NEXT: std r3, -16(r1)
526+
; CHECK-NOVSX-NEXT: addi r3, r1, -16
527+
; CHECK-NOVSX-NEXT: lvx v3, 0, r3
528+
; CHECK-NOVSX-NEXT: vperm v2, v3, v3, v2
529+
; CHECK-NOVSX-NEXT: blr
530+
entry:
531+
%0 = load <8 x i8>, <8 x i8>* %ptr, align 8
532+
%vecinit30 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
533+
%1 = bitcast <16 x i8> %vecinit30 to <2 x i64>
534+
ret <2 x i64> %1
535+
}
536+
449537
declare double @dummy() local_unnamed_addr
450538
attributes #0 = { nounwind }

0 commit comments

Comments
 (0)