@@ -3549,6 +3549,176 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
35493549 return true ;
35503550}
35513551
3552+ // Match BITOP3 operation and return a number of matched instructions plus
3553+ // truth table.
3554+ static std::pair<unsigned , uint8_t > BitOp3_Op (SDValue In,
3555+ SmallVectorImpl<SDValue> &Src) {
3556+ unsigned NumOpcodes = 0 ;
3557+ uint8_t LHSBits, RHSBits;
3558+
3559+ auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
3560+ // Define truth table given Src0, Src1, Src2 bits permutations:
3561+ // 0 0 0
3562+ // 0 0 1
3563+ // 0 1 0
3564+ // 0 1 1
3565+ // 1 0 0
3566+ // 1 0 1
3567+ // 1 1 0
3568+ // 1 1 1
3569+ const uint8_t SrcBits[3 ] = { 0xf0 , 0xcc , 0xaa };
3570+
3571+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
3572+ if (C->isAllOnes ()) {
3573+ Bits = 0xff ;
3574+ return true ;
3575+ }
3576+ if (C->isZero ()) {
3577+ Bits = 0 ;
3578+ return true ;
3579+ }
3580+ }
3581+
3582+ for (unsigned I = 0 ; I < Src.size (); ++I) {
3583+ // Try to find existing reused operand
3584+ if (Src[I] == Op) {
3585+ Bits = SrcBits[I];
3586+ return true ;
3587+ }
3588+ // Try to replace parent operator
3589+ if (Src[I] == In) {
3590+ Bits = SrcBits[I];
3591+ Src[I] = Op;
3592+ return true ;
3593+ }
3594+ }
3595+
3596+ if (Src.size () == 3 ) {
3597+ // No room left for operands. Try one last time, there can be a 'not' of
3598+ // one of our source operands. In this case we can compute the bits
3599+ // without growing Src vector.
3600+ if (Op.getOpcode () == ISD::XOR) {
3601+ if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand (1 ))) {
3602+ if (C->isAllOnes ()) {
3603+ SDValue LHS = Op.getOperand (0 );
3604+ for (unsigned I = 0 ; I < Src.size (); ++I) {
3605+ if (Src[I] == LHS) {
3606+ Bits = ~SrcBits[I];
3607+ return true ;
3608+ }
3609+ }
3610+ }
3611+ }
3612+ }
3613+
3614+ return false ;
3615+ }
3616+
3617+ Bits = SrcBits[Src.size ()];
3618+ Src.push_back (Op);
3619+ return true ;
3620+ };
3621+
3622+ switch (In.getOpcode ()) {
3623+ case ISD::AND:
3624+ case ISD::OR:
3625+ case ISD::XOR: {
3626+ SDValue LHS = In.getOperand (0 );
3627+ SDValue RHS = In.getOperand (1 );
3628+
3629+ SmallVector<SDValue, 3 > Backup (Src.begin (), Src.end ());
3630+ if (!getOperandBits (LHS, LHSBits) ||
3631+ !getOperandBits (RHS, RHSBits)) {
3632+ Src = Backup;
3633+ return std::make_pair (0 , 0 );
3634+ }
3635+
3636+ // Recursion is naturally limited by the size of the operand vector.
3637+ auto Op = BitOp3_Op (LHS, Src);
3638+ if (Op.first ) {
3639+ NumOpcodes += Op.first ;
3640+ LHSBits = Op.second ;
3641+ }
3642+
3643+ Op = BitOp3_Op (RHS, Src);
3644+ if (Op.first ) {
3645+ NumOpcodes += Op.first ;
3646+ RHSBits = Op.second ;
3647+ }
3648+ break ;
3649+ }
3650+ default :
3651+ return std::make_pair (0 , 0 );
3652+ }
3653+
3654+ uint8_t TTbl;
3655+ switch (In.getOpcode ()) {
3656+ case ISD::AND:
3657+ TTbl = LHSBits & RHSBits;
3658+ break ;
3659+ case ISD::OR:
3660+ TTbl = LHSBits | RHSBits;
3661+ break ;
3662+ case ISD::XOR:
3663+ TTbl = LHSBits ^ RHSBits;
3664+ break ;
3665+ default :
3666+ break ;
3667+ }
3668+
3669+ return std::make_pair (NumOpcodes + 1 , TTbl);
3670+ }
3671+
3672+ bool AMDGPUDAGToDAGISel::SelectBITOP3 (SDValue In, SDValue &Src0, SDValue &Src1,
3673+ SDValue &Src2, SDValue &Tbl) const {
3674+ SmallVector<SDValue, 3 > Src;
3675+ uint8_t TTbl;
3676+ unsigned NumOpcodes;
3677+
3678+ std::tie (NumOpcodes, TTbl) = BitOp3_Op (In, Src);
3679+
3680+ // Src.empty() case can happen if all operands are all zero or all ones.
3681+ // Normally it shall be optimized out before reaching this.
3682+ if (NumOpcodes < 2 || Src.empty ())
3683+ return false ;
3684+
3685+ // For a uniform case threshold should be higher to account for moves between
3686+ // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
3687+ // and a readtfirstlane after.
3688+ if (NumOpcodes < 4 && !In->isDivergent ())
3689+ return false ;
3690+
3691+ if (NumOpcodes == 2 && In.getValueType () == MVT::i32 ) {
3692+ // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3693+ // asm more readable. This cannot be modeled with AddedComplexity because
3694+ // selector does not know how many operations did we match.
3695+ if ((In.getOpcode () == ISD::XOR || In.getOpcode () == ISD::OR) &&
3696+ (In.getOperand (0 ).getOpcode () == In.getOpcode () ||
3697+ In.getOperand (1 ).getOpcode () == In.getOpcode ()))
3698+ return false ;
3699+
3700+ if (In.getOpcode () == ISD::OR &&
3701+ (In.getOperand (0 ).getOpcode () == ISD::AND ||
3702+ In.getOperand (1 ).getOpcode () == ISD::AND))
3703+ return false ;
3704+ }
3705+
3706+ // Last operand can be ignored, turning a ternary operation into a binary.
3707+ // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
3708+ // 'c' with 'a' here without changing the answer. In some pathological
3709+ // cases it should be possible to get an operation with a single operand
3710+ // too if optimizer would not catch it.
3711+ while (Src.size () < 3 )
3712+ Src.push_back (Src[0 ]);
3713+
3714+ Src0 = Src[0 ];
3715+ Src1 = Src[1 ];
3716+ Src2 = Src[2 ];
3717+
3718+ Tbl = CurDAG->getTargetConstant (TTbl, SDLoc (In), MVT::i32 );
3719+ return true ;
3720+ }
3721+
35523722SDValue AMDGPUDAGToDAGISel::getHi16Elt (SDValue In) const {
35533723 if (In.isUndef ())
35543724 return CurDAG->getUNDEF (MVT::i32 );
0 commit comments