diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp index 18350650bfe2d..977fc177a3002 100644 --- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -797,6 +797,7 @@ bool PPCMIPeephole::simplifyCode() { case PPC::VSPLTH: case PPC::XXSPLTW: { unsigned MyOpcode = MI.getOpcode(); + // The operand number of the source register in the splat instruction. unsigned OpNo = MyOpcode == PPC::XXSPLTW ? 1 : 2; Register TrueReg = TRI->lookThruCopyLike(MI.getOperand(OpNo).getReg(), MRI); @@ -823,6 +824,7 @@ bool PPCMIPeephole::simplifyCode() { (MyOpcode == PPC::XXSPLTW && DefOpcode == PPC::LXVWSX) || (MyOpcode == PPC::XXSPLTW && DefOpcode == PPC::MTVSRWS)|| (MyOpcode == PPC::XXSPLTW && isConvertOfSplat()); + // If the instruction[s] that feed this splat have already splat // the value, this splat is redundant. if (AlreadySplat) { @@ -835,30 +837,58 @@ bool PPCMIPeephole::simplifyCode() { ToErase = &MI; Simplified = true; } + // Splat fed by a shift. Usually when we align value to splat into // vector element zero. if (DefOpcode == PPC::XXSLDWI) { - Register ShiftRes = DefMI->getOperand(0).getReg(); Register ShiftOp1 = DefMI->getOperand(1).getReg(); Register ShiftOp2 = DefMI->getOperand(2).getReg(); - unsigned ShiftImm = DefMI->getOperand(3).getImm(); - unsigned SplatImm = - MI.getOperand(MyOpcode == PPC::XXSPLTW ? 
2 : 1).getImm(); + if (ShiftOp1 == ShiftOp2) { - unsigned NewElem = (SplatImm + ShiftImm) & 0x3; + // For example, we can erase the XXSLDWI in the following: + // %2:vrrc = XXSLDWI killed %1:vrrc, %1:vrrc, 1 + // %6:vrrc = VSPLTB 15, killed %2:vrrc + // %7:vsrc = XXLAND killed %6:vrrc, killed %1:vrrc + // + // ---> + // + // %6:vrrc = VSPLTB 3, killed %1:vrrc + // %7:vsrc = XXLAND killed %6:vrrc, killed %1:vrrc + + Register ShiftRes = DefMI->getOperand(0).getReg(); if (MRI->hasOneNonDBGUse(ShiftRes)) { LLVM_DEBUG(dbgs() << "Removing redundant shift: "); LLVM_DEBUG(DefMI->dump()); ToErase = DefMI; } Simplified = true; + unsigned ShiftImm = DefMI->getOperand(3).getImm(); + // The operand number of the splat Imm in the instruction. + unsigned SplatImmNo = MyOpcode == PPC::XXSPLTW ? 2 : 1; + unsigned SplatImm = MI.getOperand(SplatImmNo).getImm(); + + // Calculate the new splat-element immediate. We need to scale the + // shift amount to the splat's element unit (byte for VSPLTB, halfword + // for VSPLTH, word for XXSPLTW) because PPC::XXSLDWI interprets its + // ShiftImm in 32-bit word units. 
+ auto CalculateNewElementIdx = [&](unsigned Opcode) { + if (Opcode == PPC::VSPLTB) + return (SplatImm + ShiftImm * 4) & 0xF; + else if (Opcode == PPC::VSPLTH) + return (SplatImm + ShiftImm * 2) & 0x7; + else + return (SplatImm + ShiftImm) & 0x3; + }; + + unsigned NewElem = CalculateNewElementIdx(MyOpcode); + LLVM_DEBUG(dbgs() << "Changing splat immediate from " << SplatImm << " to " << NewElem << " in instruction: "); LLVM_DEBUG(MI.dump()); addRegToUpdate(MI.getOperand(OpNo).getReg()); addRegToUpdate(ShiftOp1); MI.getOperand(OpNo).setReg(ShiftOp1); - MI.getOperand(2).setImm(NewElem); + MI.getOperand(SplatImmNo).setImm(NewElem); } } break; diff --git a/llvm/test/CodeGen/PowerPC/splat-after-xxsldwi.ll b/llvm/test/CodeGen/PowerPC/splat-after-xxsldwi.ll new file mode 100644 index 0000000000000..d0e96e3fbbf56 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/splat-after-xxsldwi.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names -mtriple=powerpc64-ibm-aix < %s | \ +; RUN: FileCheck %s + + +define <4 x i8> @backsmith_pure_1(<8 x i32> %0) { +; CHECK-LABEL: backsmith_pure_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ld r3, L..C0(r2) # %const.0 +; CHECK-NEXT: xxsldwi vs34, vs35, vs35, 1 +; CHECK-NEXT: lxvw4x vs36, 0, r3 +; CHECK-NEXT: vspltb v3, v3, 3 +; CHECK-NEXT: vperm v2, v2, v2, v4 +; CHECK-NEXT: xxland vs34, vs35, vs34 +; CHECK-NEXT: blr +entry: + %shuffle = shufflevector <8 x i32> %0, <8 x i32> zeroinitializer, <4 x i32> + %conv4 = trunc <4 x i32> %shuffle to <4 x i8> + %shift = shufflevector <4 x i8> %conv4, <4 x i8> zeroinitializer, <4 x i32> + %foldExtExtBinop = and <4 x i8> %shift, %conv4 + ret <4 x i8> %foldExtExtBinop +}