diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 4ce8c508c5f2b..f2757c5e49b33 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -113,6 +113,9 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
     if (tryFence(N))
       return;
     break;
+  case NVPTXISD::UNPACK_VECTOR:
+    tryUNPACK_VECTOR(N);
+    return;
   case ISD::EXTRACT_VECTOR_ELT:
     if (tryEXTRACT_VECTOR_ELEMENT(N))
       return;
@@ -445,6 +448,17 @@ bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
   return true;
 }
 
+bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
+  SDValue Vector = N->getOperand(0);
+  MVT EltVT = N->getSimpleValueType(0);
+
+  MachineSDNode *N2 =
+      CurDAG->getMachineNode(NVPTX::I64toV2I32, SDLoc(N), EltVT, EltVT, Vector);
+
+  ReplaceNode(N, N2);
+  return true;
+}
+
 // Find all instances of extract_vector_elt that use this v2f16 vector
 // and coalesce them into a scattering move instruction.
 bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 42891b8ca8d8d..23cbd458571a0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -88,6 +88,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   bool tryConstantFP(SDNode *N);
   bool SelectSETP_F16X2(SDNode *N);
   bool SelectSETP_BF16X2(SDNode *N);
+  bool tryUNPACK_VECTOR(SDNode *N);
   bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
   void SelectV2I64toI128(SDNode *N);
   void SelectI128toV2I64(SDNode *N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b768725b04256..6a02f478206d5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -66,6 +66,7 @@
 #include <optional>
 #include <sstream>
 #include <string>
+#include <tuple>
 #include <utility>
 #include <vector>
 
@@ -668,8 +669,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                      {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
                      Expand);
 
-  if (STI.hasHWROT32())
+  if (STI.hasHWROT32()) {
     setOperationAction({ISD::FSHL, ISD::FSHR}, MVT::i32, Legal);
+    setOperationAction({ISD::ROTL, ISD::ROTR, ISD::FSHL, ISD::FSHR}, MVT::i64,
+                       Custom);
+  }
 
   setOperationAction(ISD::BSWAP, MVT::i16, Expand);
 
@@ -1056,6 +1060,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(NVPTXISD::StoreRetvalV2)
     MAKE_CASE(NVPTXISD::StoreRetvalV4)
     MAKE_CASE(NVPTXISD::PseudoUseParam)
+    MAKE_CASE(NVPTXISD::UNPACK_VECTOR)
+    MAKE_CASE(NVPTXISD::BUILD_VECTOR)
     MAKE_CASE(NVPTXISD::RETURN)
     MAKE_CASE(NVPTXISD::CallSeqBegin)
     MAKE_CASE(NVPTXISD::CallSeqEnd)
@@ -2758,6 +2764,61 @@ static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG) {
   return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
 }
 
+static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL,
+                           unsigned Opcode, SelectionDAG &DAG) {
+  assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64);
+
+  const auto *AmtConst = dyn_cast<ConstantSDNode>(ShiftAmount);
+  if (!AmtConst)
+    return SDValue();
+  const auto Amt = AmtConst->getZExtValue() & 63;
+
+  SDValue UnpackA =
+      DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, A);
+  SDValue UnpackB =
+      DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, B);
+
+  // Arch is little-endian: 0 = low bits, 1 = high bits
+  SDValue ALo = UnpackA.getValue(0);
+  SDValue AHi = UnpackA.getValue(1);
+  SDValue BLo = UnpackB.getValue(0);
+  SDValue BHi = UnpackB.getValue(1);
+
+  // The bitfield consists of { AHi : ALo : BHi : BLo }
+  //
+  // * FSHL, Amt <  32 - The window will contain { AHi : ALo : BHi }
+  // * FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo }
+  // * FSHR, Amt <  32 - The window will contain { ALo : BHi : BLo }
+  // * FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi }
+  //
+  // Note that Amt = 0 and Amt = 32 are special cases where 32-bit funnel
+  // shifts are not needed at all. Amt = 0 is a no-op producing either A or B
+  // depending on the direction. Amt = 32 can be implemented by a packing and
+  // unpacking move to select and arrange the 32-bit values. For simplicity,
+  // these cases are not handled here explicitly and instead we rely on
+  // DAGCombiner to remove the no-op funnel shifts we insert.
+  auto [High, Mid, Low] = ((Opcode == ISD::FSHL) == (Amt < 32))
+                              ? std::make_tuple(AHi, ALo, BHi)
+                              : std::make_tuple(ALo, BHi, BLo);
+
+  SDValue NewAmt = DAG.getConstant(Amt & 31, DL, MVT::i32);
+  SDValue RHi = DAG.getNode(Opcode, DL, MVT::i32, {High, Mid, NewAmt});
+  SDValue RLo = DAG.getNode(Opcode, DL, MVT::i32, {Mid, Low, NewAmt});
+
+  return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {RLo, RHi});
+}
+
+static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG) {
+  return expandFSH64(Op->getOperand(0), Op->getOperand(1), Op->getOperand(2),
+                     SDLoc(Op), Op->getOpcode(), DAG);
+}
+
+static SDValue lowerROT(SDValue Op, SelectionDAG &DAG) {
+  unsigned Opcode = Op->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
+  return expandFSH64(Op->getOperand(0), Op->getOperand(0), Op->getOperand(1),
+                     SDLoc(Op), Opcode, DAG);
+}
+
 SDValue
 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
@@ -2818,6 +2879,12 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerVAARG(Op, DAG);
   case ISD::VASTART:
     return LowerVASTART(Op, DAG);
+  case ISD::FSHL:
+  case ISD::FSHR:
+    return lowerFSH(Op, DAG);
+  case ISD::ROTL:
+  case ISD::ROTR:
+    return lowerROT(Op, DAG);
   case ISD::ABS:
   case ISD::SMIN:
   case ISD::SMAX:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index ff0241886223b..39470be254efa 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -61,6 +61,17 @@ enum NodeType : unsigned {
   BFE,
   BFI,
   PRMT,
+
+  /// This node is similar to ISD::BUILD_VECTOR except that the output may be
+  /// implicitly bitcast to a scalar. This allows for the representation of
+  /// packing move instructions for vector types that are not legal, e.g. v2i32.
+  BUILD_VECTOR,
+
+  /// This node is the inverse of NVPTXISD::BUILD_VECTOR. It takes a single
+  /// value which may be a scalar and unpacks it into multiple values by
+  /// implicitly converting it to a vector.
+ UNPACK_VECTOR, + FCOPYSIGN, DYNAMIC_STACKALLOC, STACKRESTORE, diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 3c88551d7b23c..83509b1078c57 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -3222,6 +3222,12 @@ def : Pat<(v2i16 (build_vector i16:$a, i16:$b)), def: Pat<(v2i16 (scalar_to_vector i16:$a)), (CVT_u32_u16 $a, CvtNONE)>; + +def nvptx_build_vector : SDNode<"NVPTXISD::BUILD_VECTOR", SDTypeProfile<1, 2, []>, []>; + +def : Pat<(i64 (nvptx_build_vector i32:$a, i32:$b)), + (V2I32toI64 $a, $b)>; + // // Funnel-Shift // diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll index 4174fd2f3ec2c..f77fb4115567b 100644 --- a/llvm/test/CodeGen/NVPTX/rotate.ll +++ b/llvm/test/CodeGen/NVPTX/rotate.ll @@ -15,8 +15,6 @@ declare i32 @llvm.fshl.i32(i32, i32, i32) declare i32 @llvm.fshr.i32(i32, i32, i32) -; SM20: rotate32 -; SM35: rotate32 define i32 @rotate32(i32 %a, i32 %b) { ; SM20-LABEL: rotate32( ; SM20: { @@ -48,8 +46,6 @@ define i32 @rotate32(i32 %a, i32 %b) { ret i32 %val } -; SM20: rotate64 -; SM35: rotate64 define i64 @rotate64(i64 %a, i32 %b) { ; SM20-LABEL: rotate64( ; SM20: { @@ -88,8 +84,6 @@ define i64 @rotate64(i64 %a, i32 %b) { ret i64 %val } -; SM20: rotateright64 -; SM35: rotateright64 define i64 @rotateright64(i64 %a, i32 %b) { ; SM20-LABEL: rotateright64( ; SM20: { @@ -128,8 +122,6 @@ define i64 @rotateright64(i64 %a, i32 %b) { ret i64 %val } -; SM20: rotl0 -; SM35: rotl0 define i32 @rotl0(i32 %x) { ; SM20-LABEL: rotl0( ; SM20: { @@ -158,7 +150,6 @@ define i32 @rotl0(i32 %x) { ret i32 %t2 } -; SM35: rotl64 define i64 @rotl64(i64 %a, i64 %n) { ; SM20-LABEL: rotl64( ; SM20: { @@ -197,36 +188,94 @@ define i64 @rotl64(i64 %a, i64 %n) { ret i64 %val } -; SM35: rotl64_imm -define i64 @rotl64_imm(i64 %a) { -; SM20-LABEL: rotl64_imm( +define i64 @rotl64_low_imm(i64 %a) { +; SM20-LABEL: rotl64_low_imm( ; SM20: { ; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0]; +; SM20-NEXT: ld.param.u64 %rd1, [rotl64_low_imm_param_0]; ; SM20-NEXT: shr.u64 %rd2, %rd1, 62; ; SM20-NEXT: shl.b64 %rd3, %rd1, 2; ; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; ; SM20-NEXT: st.param.b64 [func_retval0], %rd4; ; SM20-NEXT: ret; ; -; SM35-LABEL: rotl64_imm( +; SM35-LABEL: rotl64_low_imm( ; SM35: { -; SM35-NEXT: .reg .b64 %rd<5>; +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<3>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0]; -; SM35-NEXT: shr.u64 %rd2, %rd1, 62; -; SM35-NEXT: shl.b64 %rd3, %rd1, 2; -; SM35-NEXT: or.b64 %rd4, %rd3, %rd2; -; SM35-NEXT: st.param.b64 [func_retval0], %rd4; +; SM35-NEXT: ld.param.u64 %rd1, [rotl64_low_imm_param_0]; +; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; +; SM35-NEXT: shf.l.wrap.b32 %r3, %r1, %r2, 2; +; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, 2; +; SM35-NEXT: mov.b64 %rd2, {%r4, %r3}; +; SM35-NEXT: st.param.b64 [func_retval0], %rd2; ; SM35-NEXT: ret; %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66) ret i64 %val } -; SM35: rotr64 +define i64 @rotl64_high_imm(i64 %a) { +; SM20-LABEL: rotl64_high_imm( +; SM20: { +; SM20-NEXT: .reg .b64 %rd<5>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u64 %rd1, [rotl64_high_imm_param_0]; +; SM20-NEXT: shr.u64 %rd2, %rd1, 1; +; SM20-NEXT: shl.b64 %rd3, %rd1, 63; +; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; +; SM20-NEXT: st.param.b64 [func_retval0], %rd4; +; SM20-NEXT: ret; +; +; 
SM35-LABEL: rotl64_high_imm( +; SM35: { +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<3>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u64 %rd1, [rotl64_high_imm_param_0]; +; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; +; SM35-NEXT: shf.l.wrap.b32 %r3, %r2, %r1, 31; +; SM35-NEXT: shf.l.wrap.b32 %r4, %r1, %r2, 31; +; SM35-NEXT: mov.b64 %rd2, {%r4, %r3}; +; SM35-NEXT: st.param.b64 [func_retval0], %rd2; +; SM35-NEXT: ret; + %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 63) + ret i64 %val +} + +define i64 @rotl64_32_imm(i64 %a) { +; SM20-LABEL: rotl64_32_imm( +; SM20: { +; SM20-NEXT: .reg .b64 %rd<5>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u64 %rd1, [rotl64_32_imm_param_0]; +; SM20-NEXT: shr.u64 %rd2, %rd1, 32; +; SM20-NEXT: shl.b64 %rd3, %rd1, 32; +; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; +; SM20-NEXT: st.param.b64 [func_retval0], %rd4; +; SM20-NEXT: ret; +; +; SM35-LABEL: rotl64_32_imm( +; SM35: { +; SM35-NEXT: .reg .b32 %r<3>; +; SM35-NEXT: .reg .b64 %rd<3>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u64 %rd1, [rotl64_32_imm_param_0]; +; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; +; SM35-NEXT: mov.b64 %rd2, {%r2, %r1}; +; SM35-NEXT: st.param.b64 [func_retval0], %rd2; +; SM35-NEXT: ret; + %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 32) + ret i64 %val +} + define i64 @rotr64(i64 %a, i64 %n) { ; SM20-LABEL: rotr64( ; SM20: { @@ -265,32 +314,91 @@ define i64 @rotr64(i64 %a, i64 %n) { ret i64 %val } -; SM35: rotr64_imm -define i64 @rotr64_imm(i64 %a) { -; SM20-LABEL: rotr64_imm( +define i64 @rotr64_low_imm(i64 %a) { +; SM20-LABEL: rotr64_low_imm( ; SM20: { ; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0]; -; SM20-NEXT: shl.b64 %rd2, %rd1, 62; -; SM20-NEXT: shr.u64 %rd3, %rd1, 2; +; SM20-NEXT: ld.param.u64 %rd1, [rotr64_low_imm_param_0]; +; SM20-NEXT: shl.b64 %rd2, %rd1, 52; +; SM20-NEXT: shr.u64 %rd3, %rd1, 12; ; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; ; SM20-NEXT: st.param.b64 [func_retval0], %rd4; ; SM20-NEXT: ret; ; -; SM35-LABEL: rotr64_imm( +; SM35-LABEL: rotr64_low_imm( ; SM35: { -; SM35-NEXT: .reg .b64 %rd<5>; +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<3>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0]; -; SM35-NEXT: shl.b64 %rd2, %rd1, 62; -; SM35-NEXT: shr.u64 %rd3, %rd1, 2; -; SM35-NEXT: or.b64 %rd4, %rd3, %rd2; -; SM35-NEXT: st.param.b64 [func_retval0], %rd4; +; SM35-NEXT: ld.param.u64 %rd1, [rotr64_low_imm_param_0]; +; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; +; SM35-NEXT: shf.r.wrap.b32 %r3, %r2, %r1, 12; +; SM35-NEXT: shf.r.wrap.b32 %r4, %r1, %r2, 12; +; SM35-NEXT: mov.b64 %rd2, {%r4, %r3}; +; SM35-NEXT: st.param.b64 [func_retval0], %rd2; ; SM35-NEXT: ret; - %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66) + %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 12) + ret i64 %val +} + +define i64 @rotr64_high_imm(i64 %a) { +; SM20-LABEL: rotr64_high_imm( +; SM20: { +; SM20-NEXT: .reg .b64 %rd<5>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u64 %rd1, [rotr64_high_imm_param_0]; +; SM20-NEXT: shl.b64 %rd2, %rd1, 21; +; SM20-NEXT: shr.u64 %rd3, %rd1, 43; +; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; +; SM20-NEXT: st.param.b64 [func_retval0], %rd4; +; SM20-NEXT: ret; +; +; SM35-LABEL: rotr64_high_imm( +; SM35: { +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<3>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u64 %rd1, 
[rotr64_high_imm_param_0]; +; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; +; SM35-NEXT: shf.r.wrap.b32 %r3, %r1, %r2, 11; +; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, 11; +; SM35-NEXT: mov.b64 %rd2, {%r4, %r3}; +; SM35-NEXT: st.param.b64 [func_retval0], %rd2; +; SM35-NEXT: ret; + %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 43) + ret i64 %val +} + +define i64 @rotr64_32_imm(i64 %a) { +; SM20-LABEL: rotr64_32_imm( +; SM20: { +; SM20-NEXT: .reg .b64 %rd<5>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u64 %rd1, [rotr64_32_imm_param_0]; +; SM20-NEXT: shl.b64 %rd2, %rd1, 32; +; SM20-NEXT: shr.u64 %rd3, %rd1, 32; +; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; +; SM20-NEXT: st.param.b64 [func_retval0], %rd4; +; SM20-NEXT: ret; +; +; SM35-LABEL: rotr64_32_imm( +; SM35: { +; SM35-NEXT: .reg .b32 %r<3>; +; SM35-NEXT: .reg .b64 %rd<3>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u64 %rd1, [rotr64_32_imm_param_0]; +; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; +; SM35-NEXT: mov.b64 %rd2, {%r2, %r1}; +; SM35-NEXT: st.param.b64 [func_retval0], %rd2; +; SM35-NEXT: ret; + %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 32) ret i64 %val } @@ -446,3 +554,194 @@ define i64 @funnel_shift_left_64(i64 %a, i64 %b, i64 %c) { ret i64 %val } +define i64 @fshl64_low_imm(i64 %a, i64 %b) { +; SM20-LABEL: fshl64_low_imm( +; SM20: { +; SM20-NEXT: .reg .b64 %rd<6>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u64 %rd1, [fshl64_low_imm_param_0]; +; SM20-NEXT: ld.param.u64 %rd2, [fshl64_low_imm_param_1]; +; SM20-NEXT: shr.u64 %rd3, %rd2, 59; +; SM20-NEXT: shl.b64 %rd4, %rd1, 5; +; SM20-NEXT: or.b64 %rd5, %rd4, %rd3; +; SM20-NEXT: st.param.b64 [func_retval0], %rd5; +; SM20-NEXT: ret; +; +; SM35-LABEL: fshl64_low_imm( +; SM35: { +; SM35-NEXT: .reg .b32 %r<7>; +; SM35-NEXT: .reg .b64 %rd<4>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u64 %rd1, [fshl64_low_imm_param_0]; +; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; +; SM35-NEXT: ld.param.u64 %rd2, [fshl64_low_imm_param_1]; +; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2; +; SM35-NEXT: shf.l.wrap.b32 %r5, %r4, %r1, 5; +; SM35-NEXT: shf.l.wrap.b32 %r6, %r1, %r2, 5; +; SM35-NEXT: mov.b64 %rd3, {%r5, %r6}; +; SM35-NEXT: st.param.b64 [func_retval0], %rd3; +; SM35-NEXT: ret; + %val = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 5) + ret i64 %val +} + +define i64 @fshl64_high_imm(i64 %a, i64 %b) { +; SM20-LABEL: fshl64_high_imm( +; SM20: { +; SM20-NEXT: .reg .b64 %rd<6>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u64 %rd1, [fshl64_high_imm_param_0]; +; SM20-NEXT: ld.param.u64 %rd2, [fshl64_high_imm_param_1]; +; SM20-NEXT: shr.u64 %rd3, %rd2, 9; +; SM20-NEXT: shl.b64 %rd4, %rd1, 55; +; SM20-NEXT: or.b64 %rd5, %rd4, %rd3; +; SM20-NEXT: st.param.b64 [func_retval0], %rd5; +; SM20-NEXT: ret; +; +; SM35-LABEL: fshl64_high_imm( +; SM35: { +; SM35-NEXT: .reg .b32 %r<7>; +; SM35-NEXT: .reg .b64 %rd<4>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u64 %rd1, [fshl64_high_imm_param_0]; +; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; +; SM35-NEXT: ld.param.u64 %rd2, [fshl64_high_imm_param_1]; +; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2; +; SM35-NEXT: shf.l.wrap.b32 %r5, %r4, %r1, 23; +; SM35-NEXT: shf.l.wrap.b32 %r6, %r3, %r4, 23; +; SM35-NEXT: mov.b64 %rd3, {%r6, %r5}; +; SM35-NEXT: st.param.b64 [func_retval0], %rd3; +; SM35-NEXT: ret; + %val = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 55) + ret i64 %val +} + +define i64 @fshl64_32_imm(i64 %a, i64 %b) { +; SM20-LABEL: fshl64_32_imm( +; SM20: { +; SM20-NEXT: .reg .b64 
%rd<5>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u64 %rd1, [fshl64_32_imm_param_0]; +; SM20-NEXT: shl.b64 %rd2, %rd1, 32; +; SM20-NEXT: ld.param.u32 %rd3, [fshl64_32_imm_param_1+4]; +; SM20-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM20-NEXT: st.param.b64 [func_retval0], %rd4; +; SM20-NEXT: ret; +; +; SM35-LABEL: fshl64_32_imm( +; SM35: { +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<4>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u64 %rd1, [fshl64_32_imm_param_0]; +; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; +; SM35-NEXT: ld.param.u64 %rd2, [fshl64_32_imm_param_1]; +; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2; +; SM35-NEXT: mov.b64 %rd3, {%r4, %r1}; +; SM35-NEXT: st.param.b64 [func_retval0], %rd3; +; SM35-NEXT: ret; + %val = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 32) + ret i64 %val +} + +define i64 @fshr64_low_imm(i64 %a, i64 %b) { +; SM20-LABEL: fshr64_low_imm( +; SM20: { +; SM20-NEXT: .reg .b64 %rd<6>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u64 %rd1, [fshr64_low_imm_param_0]; +; SM20-NEXT: ld.param.u64 %rd2, [fshr64_low_imm_param_1]; +; SM20-NEXT: shr.u64 %rd3, %rd2, 31; +; SM20-NEXT: shl.b64 %rd4, %rd1, 33; +; SM20-NEXT: or.b64 %rd5, %rd4, %rd3; +; SM20-NEXT: st.param.b64 [func_retval0], %rd5; +; SM20-NEXT: ret; +; +; SM35-LABEL: fshr64_low_imm( +; SM35: { +; SM35-NEXT: .reg .b32 %r<7>; +; SM35-NEXT: .reg .b64 %rd<4>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u64 %rd1, [fshr64_low_imm_param_0]; +; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; +; SM35-NEXT: ld.param.u64 %rd2, [fshr64_low_imm_param_1]; +; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2; +; SM35-NEXT: shf.r.wrap.b32 %r5, %r4, %r1, 31; +; SM35-NEXT: shf.r.wrap.b32 %r6, %r3, %r4, 31; +; SM35-NEXT: mov.b64 %rd3, {%r6, %r5}; +; SM35-NEXT: st.param.b64 [func_retval0], %rd3; +; SM35-NEXT: ret; + %val = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 31) + ret i64 %val +} + +define i64 @fshr64_high_imm(i64 %a, i64 %b) { +; SM20-LABEL: fshr64_high_imm( +; SM20: { +; SM20-NEXT: .reg .b64 %rd<6>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u64 %rd1, [fshr64_high_imm_param_0]; +; SM20-NEXT: ld.param.u64 %rd2, [fshr64_high_imm_param_1]; +; SM20-NEXT: shr.u64 %rd3, %rd2, 33; +; SM20-NEXT: shl.b64 %rd4, %rd1, 31; +; SM20-NEXT: or.b64 %rd5, %rd4, %rd3; +; SM20-NEXT: st.param.b64 [func_retval0], %rd5; +; SM20-NEXT: ret; +; +; SM35-LABEL: fshr64_high_imm( +; SM35: { +; SM35-NEXT: .reg .b32 %r<7>; +; SM35-NEXT: .reg .b64 %rd<4>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u64 %rd1, [fshr64_high_imm_param_0]; +; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; +; SM35-NEXT: ld.param.u64 %rd2, [fshr64_high_imm_param_1]; +; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2; +; SM35-NEXT: shf.r.wrap.b32 %r5, %r4, %r1, 1; +; SM35-NEXT: shf.r.wrap.b32 %r6, %r1, %r2, 1; +; SM35-NEXT: mov.b64 %rd3, {%r5, %r6}; +; SM35-NEXT: st.param.b64 [func_retval0], %rd3; +; SM35-NEXT: ret; + %val = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 33) + ret i64 %val +} + +define i64 @fshr64_32_imm(i64 %a, i64 %b) { +; SM20-LABEL: fshr64_32_imm( +; SM20: { +; SM20-NEXT: .reg .b64 %rd<5>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u64 %rd1, [fshr64_32_imm_param_0]; +; SM20-NEXT: shl.b64 %rd2, %rd1, 32; +; SM20-NEXT: ld.param.u32 %rd3, [fshr64_32_imm_param_1+4]; +; SM20-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM20-NEXT: st.param.b64 [func_retval0], %rd4; +; SM20-NEXT: ret; +; +; SM35-LABEL: fshr64_32_imm( +; SM35: { +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<4>; +; 
SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u64 %rd1, [fshr64_32_imm_param_0]; +; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; +; SM35-NEXT: ld.param.u64 %rd2, [fshr64_32_imm_param_1]; +; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2; +; SM35-NEXT: mov.b64 %rd3, {%r4, %r1}; +; SM35-NEXT: st.param.b64 [func_retval0], %rd3; +; SM35-NEXT: ret; + %val = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 32) + ret i64 %val +} diff --git a/llvm/test/CodeGen/NVPTX/rotate_64.ll b/llvm/test/CodeGen/NVPTX/rotate_64.ll index aa0d8efc0c700..841dc67c68640 100644 --- a/llvm/test/CodeGen/NVPTX/rotate_64.ll +++ b/llvm/test/CodeGen/NVPTX/rotate_64.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %} declare i64 @llvm.nvvm.rotate.b64(i64, i32) @@ -8,14 +8,16 @@ declare i64 @llvm.nvvm.rotate.right.b64(i64, i32) define i64 @rotate64(i64 %a, i32 %b) { ; CHECK-LABEL: rotate64( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [rotate64_param_0]; -; CHECK-NEXT: shr.u64 %rd2, %rd1, 61; -; CHECK-NEXT: shl.b64 %rd3, %rd1, 3; -; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: shf.l.wrap.b32 %r3, %r1, %r2, 3; +; CHECK-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, 3; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; ; CHECK-NEXT: ret; %val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 3) ret i64 %val @@ -24,14 +26,16 @@ define i64 @rotate64(i64 %a, i32 %b) { define i64 @rotateright64(i64 %a, i32 %b) { ; CHECK-LABEL: rotateright64( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [rotateright64_param_0]; -; CHECK-NEXT: shl.b64 %rd2, %rd1, 61; -; CHECK-NEXT: shr.u64 %rd3, %rd1, 3; -; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: shf.r.wrap.b32 %r3, %r2, %r1, 3; +; CHECK-NEXT: shf.r.wrap.b32 %r4, %r1, %r2, 3; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; ; CHECK-NEXT: ret; %val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 3) ret i64 %val
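As a sanity check on the window-selection logic in expandFSH64 above, the following standalone C++ sketch (illustrative only, not part of the patch; fsh32, fsh64Ref, and expandFSH64Ref are hypothetical names, not LLVM APIs) mirrors the unpack / {High, Mid, Low} selection / two 32-bit funnel shifts / repack sequence and compares it exhaustively against a direct 64-bit reference for every shift amount in both directions. It also confirms that the expansion happens to produce correct results even for the Amt = 0 and Amt = 32 special cases that the in-code comment defers to DAGCombiner.

// Standalone model of the expandFSH64 expansion; not part of the patch.
#include <cassert>
#include <cstdint>
#include <tuple>

// 32-bit funnel shift with ISD::FSHL/FSHR semantics for Amt in [0, 32).
// Both directions reduce to a single right shift of the concatenated 64-bit
// window; the uint32_t cast keeps the selected 32-bit slice.
static uint32_t fsh32(bool Left, uint32_t Hi, uint32_t Lo, unsigned Amt) {
  uint64_t Window = (uint64_t(Hi) << 32) | Lo;
  return uint32_t(Left ? Window >> (32 - Amt) : Window >> Amt);
}

// Reference 64-bit funnel shift, written directly from the ISD definition.
static uint64_t fsh64Ref(bool Left, uint64_t A, uint64_t B, unsigned Amt) {
  if (Amt == 0)
    return Left ? A : B;
  return Left ? (A << Amt) | (B >> (64 - Amt))
              : (B >> Amt) | (A << (64 - Amt));
}

// Mirrors expandFSH64: unpack A and B into 32-bit halves, pick the three
// halves covered by the 64-bit result window, perform two 32-bit funnel
// shifts over adjacent halves, then repack.
static uint64_t expandFSH64Ref(bool Left, uint64_t A, uint64_t B,
                               unsigned Amt) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
  auto [High, Mid, Low] = (Left == (Amt < 32))
                              ? std::make_tuple(AHi, ALo, BHi)
                              : std::make_tuple(ALo, BHi, BLo);
  uint32_t RHi = fsh32(Left, High, Mid, Amt & 31);
  uint32_t RLo = fsh32(Left, Mid, Low, Amt & 31);
  return (uint64_t(RHi) << 32) | RLo;
}

int main() {
  const uint64_t A = 0x0123456789ABCDEFULL, B = 0xFEDCBA9876543210ULL;
  for (unsigned Amt = 0; Amt < 64; ++Amt)
    for (bool Left : {false, true})
      assert(expandFSH64Ref(Left, A, B, Amt) == fsh64Ref(Left, A, B, Amt));
  // lowerROT reuses the same expansion with both operands equal.
  for (unsigned Amt = 0; Amt < 64; ++Amt)
    assert(expandFSH64Ref(true, A, A, Amt) == fsh64Ref(true, A, A, Amt));
  return 0;
}

Note that the check passing for Amt = 0 and Amt = 32 is consistent with the patch's design choice: the lowering may emit no-op 32-bit funnel shifts for those amounts and relies on DAGCombiner to clean them up, rather than special-casing them in expandFSH64.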