Skip to content

Commit 8b6e9de

Browse files
authored
[PowerPC] improve P10 store forwarding on P7 scalar to vector (#102330)
When code compiled for P7 performs a scalar-to-vector operation through a store followed by a re-load, emit stores wide enough to cover the entire vector load so that P10 hardware store forwarding can satisfy the load and the code runs more smoothly on P10.
1 parent fbf81e3 commit 8b6e9de

File tree

9 files changed

+361
-203
lines changed

9 files changed

+361
-203
lines changed

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,11 @@ using namespace llvm;
105105

106106
#define DEBUG_TYPE "ppc-lowering"
107107

108+
// Hidden command-line flag, false by default, that turns off the P10
// store-forward-friendly conversion. When it is set, the 64-bit big-endian
// path no longer marks SCALAR_TO_VECTOR as Custom for v2i64/v8i16/v16i8, and
// LowerSCALAR_TO_VECTOR falls back to a single store of the scalar instead of
// the pair of 64-bit stores that cover the 128-bit vector load.
static cl::opt<bool> DisableP10StoreForward(
109+
"disable-p10-store-forward",
110+
cl::desc("disable P10 store forward-friendly conversion"), cl::Hidden,
111+
cl::init(false));
112+
108113
// Hidden boolean command-line flag "disable-ppc-preinc"; per its description
// it disables preincrement load/store generation on PPC. No explicit
// cl::init is given here, and its uses are outside this excerpt.
static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
109114
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
110115

@@ -985,6 +990,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
985990

986991
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
987992
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
993+
// LE is P8+/64-bit so direct moves are supported and these operations
994+
// are legal. The custom transformation requires 64-bit since we need a
995+
// pair of stores that will cover a 128-bit load for P10.
996+
if (!DisableP10StoreForward && isPPC64 && !Subtarget.isLittleEndian()) {
997+
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Custom);
998+
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
999+
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
1000+
}
9881001

9891002
setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
9901003
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
@@ -11483,9 +11496,33 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
1148311496
EVT PtrVT = getPointerTy(DAG.getDataLayout());
1148411497
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
1148511498

11499+
SDValue Val = Op.getOperand(0);
11500+
EVT ValVT = Val.getValueType();
11501+
// P10 hardware store forwarding requires that a single store contains all
11502+
// the data for the load. P10 is able to merge a pair of adjacent stores. Try
11503+
// to avoid load hit store on P10 when running binaries compiled for older
11504+
// processors by generating two mergeable scalar stores to forward with the
11505+
// vector load.
11506+
if (!DisableP10StoreForward && Subtarget.isPPC64() &&
11507+
!Subtarget.isLittleEndian() && ValVT.isInteger() &&
11508+
ValVT.getSizeInBits() <= 64) {
11509+
Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, Val);
11510+
EVT ShiftAmountTy = getShiftAmountTy(MVT::i64, DAG.getDataLayout());
11511+
SDValue ShiftBy = DAG.getConstant(
11512+
64 - Op.getValueType().getScalarSizeInBits(), dl, ShiftAmountTy);
11513+
Val = DAG.getNode(ISD::SHL, dl, MVT::i64, Val, ShiftBy);
11514+
SDValue Plus8 =
11515+
DAG.getNode(ISD::ADD, dl, PtrVT, FIdx, DAG.getConstant(8, dl, PtrVT));
11516+
SDValue Store2 =
11517+
DAG.getStore(DAG.getEntryNode(), dl, Val, Plus8, MachinePointerInfo());
11518+
SDValue Store = DAG.getStore(Store2, dl, Val, FIdx, MachinePointerInfo());
11519+
return DAG.getLoad(Op.getValueType(), dl, Store, FIdx,
11520+
MachinePointerInfo());
11521+
}
11522+
1148611523
// Store the input value into Value#0 of the stack slot.
11487-
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
11488-
MachinePointerInfo());
11524+
SDValue Store =
11525+
DAG.getStore(DAG.getEntryNode(), dl, Val, FIdx, MachinePointerInfo());
1148911526
// Load it out.
1149011527
return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
1149111528
}

llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@ define void @test_aix_splatimm(i32 %arg, i32 %arg1, i32 %arg2) {
1414
; CHECK-AIX-NEXT: slwi 3, 3, 8
1515
; CHECK-AIX-NEXT: neg 3, 3
1616
; CHECK-AIX-NEXT: lwz 6, 0(3)
17-
; CHECK-AIX-NEXT: sth 3, -16(1)
17+
; CHECK-AIX-NEXT: sldi 3, 3, 48
18+
; CHECK-AIX-NEXT: std 3, -16(1)
19+
; CHECK-AIX-NEXT: std 3, -8(1)
1820
; CHECK-AIX-NEXT: addi 3, 1, -16
1921
; CHECK-AIX-NEXT: lxvw4x 34, 0, 3
2022
; CHECK-AIX-NEXT: srwi 3, 4, 16
@@ -24,9 +26,11 @@ define void @test_aix_splatimm(i32 %arg, i32 %arg1, i32 %arg2) {
2426
; CHECK-AIX-NEXT: mullw 3, 3, 4
2527
; CHECK-AIX-NEXT: li 4, 0
2628
; CHECK-AIX-NEXT: neg 3, 3
29+
; CHECK-AIX-NEXT: sldi 3, 3, 48
2730
; CHECK-AIX-NEXT: vsplth 2, 2, 0
2831
; CHECK-AIX-NEXT: stxvw4x 34, 0, 4
29-
; CHECK-AIX-NEXT: sth 3, -32(1)
32+
; CHECK-AIX-NEXT: std 3, -32(1)
33+
; CHECK-AIX-NEXT: std 3, -24(1)
3034
; CHECK-AIX-NEXT: addi 3, 1, -32
3135
; CHECK-AIX-NEXT: lxvw4x 34, 0, 3
3236
; CHECK-AIX-NEXT: vsplth 2, 2, 0

llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll

Lines changed: 24 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -338,17 +338,16 @@ define <4 x i32> @build_v4i32_load_0(ptr nocapture noundef readonly %p) {
338338
; PWR7-BE-LABEL: build_v4i32_load_0:
339339
; PWR7-BE: # %bb.0: # %entry
340340
; PWR7-BE-NEXT: lwz 3, 0(3)
341-
; PWR7-BE-NEXT: li 4, 0
342-
; PWR7-BE-NEXT: stw 4, -16(1)
343-
; PWR7-BE-NEXT: stw 3, -32(1)
341+
; PWR7-BE-NEXT: xxlxor 36, 36, 36
342+
; PWR7-BE-NEXT: sldi 3, 3, 32
343+
; PWR7-BE-NEXT: std 3, -32(1)
344+
; PWR7-BE-NEXT: std 3, -24(1)
344345
; PWR7-BE-NEXT: addis 3, 2, .LCPI8_0@toc@ha
345346
; PWR7-BE-NEXT: addi 3, 3, .LCPI8_0@toc@l
346347
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
347-
; PWR7-BE-NEXT: addi 3, 1, -16
348-
; PWR7-BE-NEXT: lxvw4x 35, 0, 3
349348
; PWR7-BE-NEXT: addi 3, 1, -32
350-
; PWR7-BE-NEXT: lxvw4x 36, 0, 3
351-
; PWR7-BE-NEXT: vperm 2, 4, 3, 2
349+
; PWR7-BE-NEXT: lxvw4x 35, 0, 3
350+
; PWR7-BE-NEXT: vperm 2, 3, 4, 2
352351
; PWR7-BE-NEXT: blr
353352
;
354353
; PWR8-BE-LABEL: build_v4i32_load_0:
@@ -402,17 +401,16 @@ define <4 x i32> @build_v4i32_load_1(ptr nocapture noundef readonly %p) {
402401
; PWR7-BE-LABEL: build_v4i32_load_1:
403402
; PWR7-BE: # %bb.0: # %entry
404403
; PWR7-BE-NEXT: lwz 3, 0(3)
405-
; PWR7-BE-NEXT: li 4, 0
406-
; PWR7-BE-NEXT: stw 4, -32(1)
407-
; PWR7-BE-NEXT: stw 3, -16(1)
404+
; PWR7-BE-NEXT: xxlxor 36, 36, 36
405+
; PWR7-BE-NEXT: sldi 3, 3, 32
406+
; PWR7-BE-NEXT: std 3, -16(1)
407+
; PWR7-BE-NEXT: std 3, -8(1)
408408
; PWR7-BE-NEXT: addis 3, 2, .LCPI9_0@toc@ha
409409
; PWR7-BE-NEXT: addi 3, 3, .LCPI9_0@toc@l
410410
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
411-
; PWR7-BE-NEXT: addi 3, 1, -32
412-
; PWR7-BE-NEXT: lxvw4x 35, 0, 3
413411
; PWR7-BE-NEXT: addi 3, 1, -16
414-
; PWR7-BE-NEXT: lxvw4x 36, 0, 3
415-
; PWR7-BE-NEXT: vperm 2, 3, 4, 2
412+
; PWR7-BE-NEXT: lxvw4x 35, 0, 3
413+
; PWR7-BE-NEXT: vperm 2, 4, 3, 2
416414
; PWR7-BE-NEXT: blr
417415
;
418416
; PWR8-BE-LABEL: build_v4i32_load_1:
@@ -466,17 +464,16 @@ define <4 x i32> @build_v4i32_load_2(ptr nocapture noundef readonly %p) {
466464
; PWR7-BE-LABEL: build_v4i32_load_2:
467465
; PWR7-BE: # %bb.0: # %entry
468466
; PWR7-BE-NEXT: lwz 3, 0(3)
469-
; PWR7-BE-NEXT: li 4, 0
470-
; PWR7-BE-NEXT: stw 4, -32(1)
471-
; PWR7-BE-NEXT: stw 3, -16(1)
467+
; PWR7-BE-NEXT: xxlxor 36, 36, 36
468+
; PWR7-BE-NEXT: sldi 3, 3, 32
469+
; PWR7-BE-NEXT: std 3, -16(1)
470+
; PWR7-BE-NEXT: std 3, -8(1)
472471
; PWR7-BE-NEXT: addis 3, 2, .LCPI10_0@toc@ha
473472
; PWR7-BE-NEXT: addi 3, 3, .LCPI10_0@toc@l
474473
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
475-
; PWR7-BE-NEXT: addi 3, 1, -32
476-
; PWR7-BE-NEXT: lxvw4x 35, 0, 3
477474
; PWR7-BE-NEXT: addi 3, 1, -16
478-
; PWR7-BE-NEXT: lxvw4x 36, 0, 3
479-
; PWR7-BE-NEXT: vperm 2, 3, 4, 2
475+
; PWR7-BE-NEXT: lxvw4x 35, 0, 3
476+
; PWR7-BE-NEXT: vperm 2, 4, 3, 2
480477
; PWR7-BE-NEXT: blr
481478
;
482479
; PWR8-BE-LABEL: build_v4i32_load_2:
@@ -530,17 +527,16 @@ define <4 x i32> @build_v4i32_load_3(ptr nocapture noundef readonly %p) {
530527
; PWR7-BE-LABEL: build_v4i32_load_3:
531528
; PWR7-BE: # %bb.0: # %entry
532529
; PWR7-BE-NEXT: lwz 3, 0(3)
533-
; PWR7-BE-NEXT: li 4, 0
534-
; PWR7-BE-NEXT: stw 4, -32(1)
535-
; PWR7-BE-NEXT: stw 3, -16(1)
530+
; PWR7-BE-NEXT: xxlxor 36, 36, 36
531+
; PWR7-BE-NEXT: sldi 3, 3, 32
532+
; PWR7-BE-NEXT: std 3, -16(1)
533+
; PWR7-BE-NEXT: std 3, -8(1)
536534
; PWR7-BE-NEXT: addis 3, 2, .LCPI11_0@toc@ha
537535
; PWR7-BE-NEXT: addi 3, 3, .LCPI11_0@toc@l
538536
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
539-
; PWR7-BE-NEXT: addi 3, 1, -32
540-
; PWR7-BE-NEXT: lxvw4x 35, 0, 3
541537
; PWR7-BE-NEXT: addi 3, 1, -16
542-
; PWR7-BE-NEXT: lxvw4x 36, 0, 3
543-
; PWR7-BE-NEXT: vperm 2, 3, 4, 2
538+
; PWR7-BE-NEXT: lxvw4x 35, 0, 3
539+
; PWR7-BE-NEXT: vperm 2, 4, 3, 2
544540
; PWR7-BE-NEXT: blr
545541
;
546542
; PWR8-BE-LABEL: build_v4i32_load_3:

llvm/test/CodeGen/PowerPC/load-and-splat.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,7 @@ define <16 x i8> @adjusted_lxvwsx(ptr %s, ptr %t) {
591591
; P7: # %bb.0: # %entry
592592
; P7-NEXT: ld r3, 0(r3)
593593
; P7-NEXT: std r3, -16(r1)
594+
; P7-NEXT: std r3, -8(r1)
594595
; P7-NEXT: addi r3, r1, -16
595596
; P7-NEXT: lxvw4x vs0, 0, r3
596597
; P7-NEXT: xxspltw v2, vs0, 1

0 commit comments

Comments
 (0)