diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index eb2ef6bc35742..3a4191a7d2963 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -18155,6 +18155,54 @@ Example:
       %r = call i8 @llvm.fshr.i8(i8 15, i8 15, i8 11)  ; %r = i8: 225 (0b11100001)
       %r = call i8 @llvm.fshr.i8(i8 0, i8 255, i8 8)   ; %r = i8: 255 (0b11111111)
 
+.. _int_clmul:
+
+'``llvm.clmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.clmul``
+on any integer bit width or any vector of integers.
+
+::
+
+      declare i16 @llvm.clmul.i16(i16 %a, i16 %b)
+      declare i32 @llvm.clmul.i32(i32 %a, i32 %b)
+      declare i64 @llvm.clmul.i64(i64 %a, i64 %b)
+      declare <4 x i32> @llvm.clmul.v4i32(<4 x i32> %a, <4 x i32> %b)
+
+Overview
+"""""""""
+
+The '``llvm.clmul``' family of intrinsic functions performs carryless multiplication
+(also known as XOR multiplication) on the two arguments.
+
+Arguments
+""""""""""
+
+The arguments (``%a`` and ``%b``) and the result may be of integer types of any bit
+width, but they must all have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo carryless multiplication.
+
+Semantics:
+""""""""""
+
+The '``llvm.clmul``' intrinsic computes the carryless multiply of ``%a`` and ``%b``,
+which is the result of applying the standard multiplication algorithm with every
+addition replaced by an exclusive or.
+The vector intrinsics, such as ``llvm.clmul.v4i32``, operate on a per-element basis
+and the element order is not affected.
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+      %res = call i4 @llvm.clmul.i4(i4 1, i4 2)    ; %res = 2
+      %res = call i4 @llvm.clmul.i4(i4 5, i4 6)    ; %res = 14
+      %res = call i4 @llvm.clmul.i4(i4 -4, i4 2)   ; %res = -8
+      %res = call i4 @llvm.clmul.i4(i4 -4, i4 -5)  ; %res = 4
+
 Arithmetic with Overflow Intrinsics
 -----------------------------------
 
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 465e4a0a9d0d8..ffb71593af8bf 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -760,6 +760,9 @@ enum NodeType {
   ROTR,
   FSHL,
   FSHR,
+
+  /// Carryless multiplication operator.
+  CLMUL,
 
   /// Byte Swap and Counting operators.
   BSWAP,
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index cbdc1b6031680..b3fcb41400ed0 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5400,6 +5400,11 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
   /// \returns The expansion if successful, SDValue() otherwise
   SDValue expandFunnelShift(SDNode *N, SelectionDAG &DAG) const;
 
+  /// Expand carryless multiply.
+  /// \param N Node to expand
+  /// \returns The expansion if successful, SDValue() otherwise
+  SDValue expandCLMUL(SDNode *N, SelectionDAG &DAG) const;
+
   /// Expand rotations.
/// \param N Node to expand /// \param AllowVectorOps expand vector rotate, this should only be performed diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index bd6f94ac1286c..f5ba9e4cbcd89 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1427,6 +1427,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; def int_fshr : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; + def int_clmul : DefaultAttrsIntrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>]>; } let IntrProperties = [IntrNoMem, IntrSpeculatable, diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index a4ed62bb5715c..c6fe89044a623 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -437,6 +437,8 @@ def sra_parts : SDNode<"ISD::SRA_PARTS" , SDTIntShiftPairOp>; def srl_parts : SDNode<"ISD::SRL_PARTS" , SDTIntShiftPairOp>; def fshl : SDNode<"ISD::FSHL" , SDTIntShiftDOp>; def fshr : SDNode<"ISD::FSHR" , SDTIntShiftDOp>; +def clmul : SDNode<"ISD::CLMUL" , SDTIntBinOp, + [SDNPCommutative, SDNPAssociative]>; def and : SDNode<"ISD::AND" , SDTIntBinOp, [SDNPCommutative, SDNPAssociative]>; def or : SDNode<"ISD::OR" , SDTIntBinOp, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index ba0ab2383d87a..597ca0e158032 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4017,6 +4017,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { if (SDValue Expanded = TLI.expandFunnelShift(Node, DAG)) Results.push_back(Expanded); break; + case ISD::CLMUL: + Results.push_back(TLI.expandCLMUL(Node, DAG)); + break; case ISD::ROTL: case ISD::ROTR: if (SDValue Expanded = TLI.expandROT(Node, true /*AllowVectorOps*/, DAG)) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 583a85a332dcd..51ef777ca6ecc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -209,7 +209,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::VP_XOR: case ISD::VP_ADD: case ISD::VP_SUB: - case ISD::VP_MUL: Res = PromoteIntRes_SimpleIntBinOp(N); break; + case ISD::VP_MUL: + case ISD::CLMUL: Res = PromoteIntRes_SimpleIntBinOp(N); break; case ISD::ABDS: case ISD::AVGCEILS: @@ -3140,6 +3141,10 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { ExpandIntRes_FunnelShift(N, Lo, Hi); break; + case ISD::CLMUL: + ExpandIntRes_CLMUL(N, Lo, Hi); + break; + case ISD::VSCALE: ExpandIntRes_VSCALE(N, Lo, Hi); break; @@ -5476,6 +5481,37 @@ void DAGTypeLegalizer::ExpandIntRes_FunnelShift(SDNode *N, SDValue &Lo, Hi = DAG.getNode(Opc, DL, HalfVT, Select3, Select2, NewShAmt); } +void DAGTypeLegalizer::ExpandIntRes_CLMUL(SDNode *N, SDValue &Lo, + SDValue &Hi) { + // Values numbered from least significant to most significant. 
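+  // Splitting A = AH * 2^N + AL and B = BH * 2^N + BL (N = half width), the
+  // 2N-bit carryless product is
+  //   CLMUL(AL, BL) ^ (CLMUL(AL, BH) << N) ^ (CLMUL(AH, BL) << N)
+  // because the CLMUL(AH, BH) term only feeds bits >= 2N. Computing the cross
+  // products at half width keeps exactly the low N bits that land in Hi.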
+  SDValue LL, LH, RL, RH;
+  GetExpandedInteger(N->getOperand(0), LL, LH);
+  GetExpandedInteger(N->getOperand(1), RL, RH);
+  EVT HalfVT = LL.getValueType();
+  SDLoc DL(N);
+
+  // The low half of the result is the carryless product of the low halves.
+  Lo = DAG.getNode(ISD::CLMUL, DL, HalfVT, LL, RL);
+  // CLMUL is carryless, so the high bits not included in CLMUL(A, B) can be
+  // computed as
+  //   BITREVERSE(CLMUL(BITREVERSE(A), BITREVERSE(B))) >> 1
+  // Therefore we can compute the two hi/lo cross products and the overflow of
+  // the low product, and XOR them together to compute Hi.
+  // TODO: If the target supports a widening CLMUL or a CLMULH, we should
+  // probably use that instead.
+  SDValue BitRevLL = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, LL);
+  SDValue BitRevRL = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, RL);
+  SDValue BitRevLoHi = DAG.getNode(ISD::CLMUL, DL, HalfVT, BitRevLL, BitRevRL);
+  SDValue LoHi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, BitRevLoHi);
+  SDValue One = DAG.getShiftAmountConstant(1, HalfVT, DL);
+  Hi = DAG.getNode(ISD::SRL, DL, HalfVT, LoHi, One);
+
+  SDValue HiTmp = DAG.getNode(ISD::CLMUL, DL, HalfVT, LL, RH);
+  Hi = DAG.getNode(ISD::XOR, DL, HalfVT, Hi, HiTmp);
+  HiTmp = DAG.getNode(ISD::CLMUL, DL, HalfVT, LH, RL);
+  Hi = DAG.getNode(ISD::XOR, DL, HalfVT, Hi, HiTmp);
+}
+
 void DAGTypeLegalizer::ExpandIntRes_VSCALE(SDNode *N, SDValue &Lo,
                                            SDValue &Hi) {
   EVT VT = N->getValueType(0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 2e13b1854bf29..8c9c557771ddb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -511,6 +511,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void ExpandIntRes_Rotate          (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_FunnelShift     (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_CLMUL           (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_VSCALE          (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_READ_REGISTER(SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 1661814d5a897..a2c3c5b74b8de 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -166,6 +166,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SMAX:
   case ISD::UMIN:
   case ISD::UMAX:
+  case ISD::CLMUL:
 
   case ISD::SADDSAT:
   case ISD::UADDSAT:
@@ -1330,6 +1331,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SMAX: case ISD::VP_SMAX:
   case ISD::UMIN: case ISD::VP_UMIN:
   case ISD::UMAX: case ISD::VP_UMAX:
+  case ISD::CLMUL:
   case ISD::SADDSAT: case ISD::VP_SADDSAT:
   case ISD::UADDSAT: case ISD::VP_UADDSAT:
   case ISD::SSUBSAT: case ISD::VP_SSUBSAT:
@@ -4764,6 +4766,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SSUBSAT: case ISD::VP_SSUBSAT:
   case ISD::SSHLSAT:
   case ISD::USHLSAT:
+  case ISD::CLMUL:
   case ISD::ROTL:
   case ISD::ROTR:
   case ISD::AVGFLOORS:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 5c586f73aa125..207c5398f8d61 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7521,6 +7521,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
   case ISD::SSUBSAT:
   case ISD::UADDSAT:
   case ISD::USUBSAT:
+  case ISD::CLMUL:
     assert(VT.isInteger()
&& "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 306e068f1c1da..f37191dfcc857 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7228,6 +7228,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } return; } + case Intrinsic::clmul: { + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + setValue(&I, DAG.getNode(ISD::CLMUL, sdl, Op1.getValueType(), Op1, Op2)); + return; + } case Intrinsic::sadd_sat: { SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 900da7645504f..68550807e731b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -299,6 +299,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::ROTR: return "rotr"; case ISD::FSHL: return "fshl"; case ISD::FSHR: return "fshr"; + case ISD::CLMUL: return "clmul"; case ISD::FADD: return "fadd"; case ISD::STRICT_FADD: return "strict_fadd"; case ISD::FSUB: return "fsub"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 1764910861df4..f2106f43e443d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8277,6 +8277,44 @@ SDValue TargetLowering::expandFunnelShift(SDNode *Node, return DAG.getNode(ISD::OR, DL, VT, ShX, ShY); } +SDValue TargetLowering::expandCLMUL(SDNode *Node, + SelectionDAG &DAG) const { + SDLoc DL(Node); + EVT VT = Node->getValueType(0); + SDValue V1 = Node->getOperand(0); + SDValue V2 = Node->getOperand(1); + unsigned NumBitsPerElt = VT.getScalarSizeInBits(); + + EVT SetCCType = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + // Only expand vector types if we have the appropriate vector bit operations. + // FIXME: Should really try to split the vector in case it's legal on a + // subvector. + if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) || + (!isOperationLegalOrCustom(ISD::SRL, VT) || + !isOperationLegalOrCustom(ISD::SHL, VT) || + !isOperationLegalOrCustom(ISD::XOR, VT) || + !isOperationLegalOrCustom(ISD::AND, VT) || + !isOperationLegalOrCustom(ISD::SELECT, VT)))) + return DAG.UnrollVectorOp(Node); + + SDValue Res = DAG.getConstant(0, DL, VT); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + SDValue OneForShift = DAG.getShiftAmountConstant(1, VT, DL); + for (unsigned I = 0; I < NumBitsPerElt; ++I) { + SDValue LowBit = DAG.getNode(ISD::AND, DL, VT, V1, One); + SDValue LowBool = DAG.getSetCC(DL, SetCCType, LowBit, Zero, ISD::SETNE); + SDValue Pred = DAG.getNode(ISD::SELECT, DL, VT, LowBool, V2, Zero); + Res = DAG.getNode(ISD::XOR, DL, VT, Res, Pred); + if (I != NumBitsPerElt - 1) { + V1 = DAG.getNode(ISD::SRL, DL, VT, V1, OneForShift); + V2 = DAG.getNode(ISD::SHL, DL, VT, V2, OneForShift); + } + } + return Res; +} + // TODO: Merge with expandFunnelShift. 
SDValue TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps, SelectionDAG &DAG) const { diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 3c91b0eb4e2ea..725fa9cacdd04 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -844,6 +844,9 @@ void TargetLoweringBase::initActions() { // Absolute difference setOperationAction({ISD::ABDS, ISD::ABDU}, VT, Expand); + // Carryless multiply + setOperationAction(ISD::CLMUL, VT, Expand); + // Saturated trunc setOperationAction(ISD::TRUNCATE_SSAT_S, VT, Expand); setOperationAction(ISD::TRUNCATE_SSAT_U, VT, Expand); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index dbe992208c9f3..9af4f3adbffa9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -402,6 +402,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, Legal); } + if (Subtarget.hasStdExtZbc() || Subtarget.hasStdExtZbkc()) { + setOperationAction(ISD::CLMUL, XLenVT, Legal); + } + if (Subtarget.hasStdExtZbb() || (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit())) { if (Subtarget.is64Bit()) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index d2a651444169c..666ae032228aa 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -799,6 +799,7 @@ def : Sh3AddPat; } // Predicates = [HasStdExtZba, IsRV64] let Predicates = [HasStdExtZbcOrZbkc] in { +def : PatGprGpr; def : PatGprGpr; def : PatGprGpr; } // Predicates = [HasStdExtZbcOrZbkc] diff --git a/llvm/test/CodeGen/Generic/clmul-expand.ll b/llvm/test/CodeGen/Generic/clmul-expand.ll new file mode 100644 index 0000000000000..21aeaf9195a1f --- /dev/null +++ b/llvm/test/CodeGen/Generic/clmul-expand.ll @@ -0,0 +1,190 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=-zbc,-zbkc -verify-machineinstrs | FileCheck %s --check-prefix=RV32-EXPAND +; RUN: llc < %s -mtriple=riscv64 -mattr=-zbc,-zbkc -verify-machineinstrs | FileCheck %s --check-prefix=RV64-EXPAND +; RUN: llc < %s -mtriple=x86_64 -verify-machineinstrs | FileCheck %s --check-prefix=X64-EXPAND + +; Test CLMUL expansion when the instruction is not natively supported + +declare i8 @llvm.clmul.i8(i8 %a, i8 %b) +declare i16 @llvm.clmul.i16(i16 %a, i16 %b) +declare i32 @llvm.clmul.i32(i32 %a, i32 %b) +declare i64 @llvm.clmul.i64(i64 %a, i64 %b) +declare i128 @llvm.clmul.i128(i128 %a, i128 %b) + +define i8 @clmul_expand_i8(i8 %a, i8 %b) nounwind { +; RV32-EXPAND-LABEL: clmul_expand_i8: +; RV32-EXPAND: # %bb.0: +; RV32-EXPAND-NEXT: andi a2, a0, 1 +; RV32-EXPAND-NEXT: beqz a2, .LBB0_2 +; RV32-EXPAND-NEXT: # %bb.1: +; RV32-EXPAND-NEXT: mv a2, a1 +; RV32-EXPAND-NEXT: j .LBB0_3 +; RV32-EXPAND-NEXT: .LBB0_2: +; RV32-EXPAND-NEXT: li a2, 0 +; RV32-EXPAND-NEXT: .LBB0_3: +; RV32-EXPAND-NEXT: srli a0, a0, 1 +; RV32-EXPAND-NEXT: slli a1, a1, 1 +; RV32-EXPAND-NEXT: andi a3, a0, 1 +; RV32-EXPAND-NEXT: beqz a3, .LBB0_5 +; RV32-EXPAND-NEXT: # %bb.4: +; RV32-EXPAND-NEXT: xor a2, a2, a1 +; RV32-EXPAND-NEXT: .LBB0_5: +; RV32-EXPAND-NEXT: srli a0, a0, 1 +; RV32-EXPAND-NEXT: slli a1, a1, 1 +; RV32-EXPAND-NEXT: andi a3, a0, 1 +; RV32-EXPAND-NEXT: beqz a3, .LBB0_7 +; RV32-EXPAND-NEXT: # %bb.6: +; RV32-EXPAND-NEXT: xor a2, a2, a1 +; RV32-EXPAND-NEXT: .LBB0_7: +; RV32-EXPAND-NEXT: srli a0, a0, 1 +; RV32-EXPAND-NEXT: slli 
a1, a1, 1 +; RV32-EXPAND-NEXT: andi a3, a0, 1 +; RV32-EXPAND-NEXT: beqz a3, .LBB0_9 +; RV32-EXPAND-NEXT: # %bb.8: +; RV32-EXPAND-NEXT: xor a2, a2, a1 +; RV32-EXPAND-NEXT: .LBB0_9: +; RV32-EXPAND-NEXT: srli a0, a0, 1 +; RV32-EXPAND-NEXT: slli a1, a1, 1 +; RV32-EXPAND-NEXT: andi a3, a0, 1 +; RV32-EXPAND-NEXT: beqz a3, .LBB0_11 +; RV32-EXPAND-NEXT: # %bb.10: +; RV32-EXPAND-NEXT: xor a2, a2, a1 +; RV32-EXPAND-NEXT: .LBB0_11: +; RV32-EXPAND-NEXT: srli a0, a0, 1 +; RV32-EXPAND-NEXT: slli a1, a1, 1 +; RV32-EXPAND-NEXT: andi a3, a0, 1 +; RV32-EXPAND-NEXT: beqz a3, .LBB0_13 +; RV32-EXPAND-NEXT: # %bb.12: +; RV32-EXPAND-NEXT: xor a2, a2, a1 +; RV32-EXPAND-NEXT: .LBB0_13: +; RV32-EXPAND-NEXT: srli a0, a0, 1 +; RV32-EXPAND-NEXT: slli a1, a1, 1 +; RV32-EXPAND-NEXT: andi a3, a0, 1 +; RV32-EXPAND-NEXT: beqz a3, .LBB0_15 +; RV32-EXPAND-NEXT: # %bb.14: +; RV32-EXPAND-NEXT: xor a2, a2, a1 +; RV32-EXPAND-NEXT: .LBB0_15: +; RV32-EXPAND-NEXT: andi a0, a0, 2 +; RV32-EXPAND-NEXT: beqz a0, .LBB0_17 +; RV32-EXPAND-NEXT: # %bb.16: +; RV32-EXPAND-NEXT: slli a0, a1, 1 +; RV32-EXPAND-NEXT: xor a2, a2, a0 +; RV32-EXPAND-NEXT: .LBB0_17: +; RV32-EXPAND-NEXT: mv a0, a2 +; RV32-EXPAND-NEXT: ret + %result = call i8 @llvm.clmul.i8(i8 %a, i8 %b) + ret i8 %result +} + +define i16 @clmul_expand_i16(i16 %a, i16 %b) nounwind { +; RV32-EXPAND-LABEL: clmul_expand_i16: +; RV32-EXPAND: # %bb.0: +; RV32-EXPAND-NEXT: andi a2, a0, 1 +; RV32-EXPAND-NEXT: beqz a2, .LBB1_2 +; RV32-EXPAND-NEXT: # %bb.1: +; RV32-EXPAND-NEXT: mv a2, a1 +; RV32-EXPAND-NEXT: j .LBB1_3 +; RV32-EXPAND-NEXT: .LBB1_2: +; RV32-EXPAND-NEXT: li a2, 0 +; RV32-EXPAND-NEXT: .LBB1_3: +; RV32-EXPAND-NEXT: srli a0, a0, 1 +; RV32-EXPAND-NEXT: slli a1, a1, 1 + %result = call i16 @llvm.clmul.i16(i16 %a, i16 %b) + ret i16 %result +} + +define i32 @clmul_expand_i32(i32 %a, i32 %b) nounwind { +; RV32-EXPAND-LABEL: clmul_expand_i32: +; RV32-EXPAND: # %bb.0: +; RV32-EXPAND-NEXT: andi a2, a0, 1 +; RV32-EXPAND-NEXT: beqz a2, .LBB2_2 +; RV32-EXPAND-NEXT: # %bb.1: +; RV32-EXPAND-NEXT: mv a2, a1 +; RV32-EXPAND-NEXT: j .LBB2_3 +; RV32-EXPAND-NEXT: .LBB2_2: +; RV32-EXPAND-NEXT: li a2, 0 +; RV32-EXPAND-NEXT: .LBB2_3: +; RV32-EXPAND-NEXT: srli a0, a0, 1 +; RV32-EXPAND-NEXT: slli a1, a1, 1 + %result = call i32 @llvm.clmul.i32(i32 %a, i32 %b) + ret i32 %result +} + +define i64 @clmul_expand_i64(i64 %a, i64 %b) nounwind { +; RV32-EXPAND-LABEL: clmul_expand_i64: +; RV32-EXPAND: # %bb.0: +; RV32-EXPAND-NEXT: addi sp, sp, -16 +; RV32-EXPAND-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-EXPAND-NEXT: call __clmuldi3 +; RV32-EXPAND-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-EXPAND-NEXT: addi sp, sp, 16 +; RV32-EXPAND-NEXT: ret +; +; RV64-EXPAND-LABEL: clmul_expand_i64: +; RV64-EXPAND: # %bb.0: +; RV64-EXPAND-NEXT: andi a2, a0, 1 +; RV64-EXPAND-NEXT: beqz a2, .LBB3_2 +; RV64-EXPAND-NEXT: # %bb.1: +; RV64-EXPAND-NEXT: mv a2, a1 +; RV64-EXPAND-NEXT: j .LBB3_3 +; RV64-EXPAND-NEXT: .LBB3_2: +; RV64-EXPAND-NEXT: li a2, 0 +; RV64-EXPAND-NEXT: .LBB3_3: +; RV64-EXPAND-NEXT: srli a0, a0, 1 +; RV64-EXPAND-NEXT: slli a1, a1, 1 + %result = call i64 @llvm.clmul.i64(i64 %a, i64 %b) + ret i64 %result +} + +define i128 @clmul_expand_i128(i128 %a, i128 %b) nounwind { +; RV32-EXPAND-LABEL: clmul_expand_i128: +; RV32-EXPAND: # %bb.0: +; RV32-EXPAND-NEXT: addi sp, sp, -16 +; RV32-EXPAND-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-EXPAND-NEXT: call __clmulti3 +; RV32-EXPAND-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-EXPAND-NEXT: addi sp, sp, 16 +; RV32-EXPAND-NEXT: ret +; +; RV64-EXPAND-LABEL: 
clmul_expand_i128: +; RV64-EXPAND: # %bb.0: +; RV64-EXPAND-NEXT: addi sp, sp, -16 +; RV64-EXPAND-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-EXPAND-NEXT: call __clmulti3 +; RV64-EXPAND-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-EXPAND-NEXT: addi sp, sp, 16 +; RV64-EXPAND-NEXT: ret + %result = call i128 @llvm.clmul.i128(i128 %a, i128 %b) + ret i128 %result +} + +; Test with known constants to verify correctness +define i8 @clmul_expand_i8_known() nounwind { +; RV32-EXPAND-LABEL: clmul_expand_i8_known: +; RV32-EXPAND: # %bb.0: +; RV32-EXPAND-NEXT: li a0, 14 +; RV32-EXPAND-NEXT: ret +; +; RV64-EXPAND-LABEL: clmul_expand_i8_known: +; RV64-EXPAND: # %bb.0: +; RV64-EXPAND-NEXT: li a0, 14 +; RV64-EXPAND-NEXT: ret + ; clmul(5, 6) = clmul(0b101, 0b110) = 0b1110 = 14 + %result = call i8 @llvm.clmul.i8(i8 5, i8 6) + ret i8 %result +} + +define i16 @clmul_expand_i16_known() nounwind { +; RV32-EXPAND-LABEL: clmul_expand_i16_known: +; RV32-EXPAND: # %bb.0: +; RV32-EXPAND-NEXT: li a0, 158 +; RV32-EXPAND-NEXT: ret +; +; RV64-EXPAND-LABEL: clmul_expand_i16_known: +; RV64-EXPAND: # %bb.0: +; RV64-EXPAND-NEXT: li a0, 158 +; RV64-EXPAND-NEXT: ret + ; clmul(15, 13) = clmul(0b1111, 0b1101) = 0b10011110 = 158 + %result = call i16 @llvm.clmul.i16(i16 15, i16 13) + ret i16 %result +} \ No newline at end of file diff --git a/llvm/test/CodeGen/Generic/clmul-vector.ll b/llvm/test/CodeGen/Generic/clmul-vector.ll new file mode 100644 index 0000000000000..2bcf77f32f68c --- /dev/null +++ b/llvm/test/CodeGen/Generic/clmul-vector.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=-zbc,-zbkc -verify-machineinstrs | FileCheck %s --check-prefix=RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=-zbc,-zbkc -verify-machineinstrs | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=x86_64 -verify-machineinstrs | FileCheck %s --check-prefix=X64 + +; Test CLMUL with vector types + +declare <2 x i32> @llvm.clmul.v2i32(<2 x i32> %a, <2 x i32> %b) +declare <4 x i32> @llvm.clmul.v4i32(<4 x i32> %a, <4 x i32> %b) +declare <2 x i64> @llvm.clmul.v2i64(<2 x i64> %a, <2 x i64> %b) +declare <8 x i16> @llvm.clmul.v8i16(<8 x i16> %a, <8 x i16> %b) +declare <16 x i8> @llvm.clmul.v16i8(<16 x i8> %a, <16 x i8> %b) + +define <2 x i32> @clmul_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { +; RV32-LABEL: clmul_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s0, a3 +; RV32-NEXT: mv s1, a2 +; RV32-NEXT: mv s2, a1 +; RV32-NEXT: mv s3, a0 +; RV32-NEXT: andi a0, a3, 1 +; RV32-NEXT: beqz a0, .LBB0_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv s4, s1 +; RV32-NEXT: j .LBB0_3 +; RV32-NEXT: .LBB0_2: +; RV32-NEXT: li s4, 0 +; RV32-NEXT: .LBB0_3: +; RV32-NEXT: srli s0, s0, 1 +; RV32-NEXT: slli s1, s1, 1 + %result = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> %a, <2 x i32> %b) + ret <2 x i32> %result +} + +define <4 x i32> @clmul_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; RV32-LABEL: clmul_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 68(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 64(sp) # 4-byte Folded Spill 
+; RV32-NEXT: sw s3, 60(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 48(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s9, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s10, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s11, 28(sp) # 4-byte Folded Spill + %result = call <4 x i32> @llvm.clmul.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +define <2 x i64> @clmul_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { +; RV32-LABEL: clmul_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: lw s0, 64(sp) +; RV32-NEXT: lw s1, 68(sp) +; RV32-NEXT: lw s2, 72(sp) +; RV32-NEXT: lw s3, 76(sp) +; RV32-NEXT: mv s4, a0 +; RV32-NEXT: mv s5, a1 +; RV32-NEXT: mv s6, a2 +; RV32-NEXT: mv s7, a3 +; RV32-NEXT: mv a0, s6 +; RV32-NEXT: mv a1, s7 +; RV32-NEXT: mv a2, s2 +; RV32-NEXT: mv a3, s3 +; RV32-NEXT: call __clmuldi3 +; RV32-NEXT: mv s8, a0 +; RV32-NEXT: mv s2, a1 +; RV32-NEXT: mv a0, s4 +; RV32-NEXT: mv a1, s5 +; RV32-NEXT: mv a2, s0 +; RV32-NEXT: mv a3, s1 +; RV32-NEXT: call __clmuldi3 +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: mv a3, a1 +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a1, a3 +; RV32-NEXT: mv a2, s8 +; RV32-NEXT: mv a3, s2 +; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s8, 8(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: ret + %result = call <2 x i64> @llvm.clmul.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %result +} + +define <8 x i16> @clmul_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { +; RV32-LABEL: clmul_v8i16: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -144 +; RV32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill + %result = call <8 x i16> @llvm.clmul.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %result +} + +define <16 x i8> @clmul_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; RV32-LABEL: clmul_v16i8: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -272 +; RV32-NEXT: sw ra, 268(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 264(sp) # 4-byte Folded Spill + %result = call <16 x i8> @llvm.clmul.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %result +} + +; Test with splat vectors +define <2 x i32> @clmul_v2i32_splat(<2 x i32> %a) nounwind { +; RV32-LABEL: clmul_v2i32_splat: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw 
s3, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s0, a1 +; RV32-NEXT: mv s1, a0 +; RV32-NEXT: andi a0, a1, 1 +; RV32-NEXT: li s2, 3 +; RV32-NEXT: beqz a0, .LBB5_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv s3, s2 +; RV32-NEXT: j .LBB5_3 +; RV32-NEXT: .LBB5_2: +; RV32-NEXT: li s3, 0 +; RV32-NEXT: .LBB5_3: +; RV32-NEXT: srli s0, s0, 1 +; RV32-NEXT: slli s2, s2, 1 + %splat = insertelement <2 x i32> poison, i32 3, i32 0 + %splat_vec = shufflevector <2 x i32> %splat, <2 x i32> poison, <2 x i32> zeroinitializer + %result = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> %a, <2 x i32> %splat_vec) + ret <2 x i32> %result +} + +; Test with constant vectors +define <2 x i32> @clmul_v2i32_const() nounwind { +; RV32-LABEL: clmul_v2i32_const: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 14 +; RV32-NEXT: li a1, 6 +; RV32-NEXT: ret + %result = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> , <2 x i32> ) + ret <2 x i32> %result +} \ No newline at end of file diff --git a/llvm/test/CodeGen/RISCV/clmul-intrinsic.ll b/llvm/test/CodeGen/RISCV/clmul-intrinsic.ll new file mode 100644 index 0000000000000..291509c343efe --- /dev/null +++ b/llvm/test/CodeGen/RISCV/clmul-intrinsic.ll @@ -0,0 +1,166 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+zbc -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32ZBC +; RUN: llc -mtriple=riscv32 -mattr=+zbkc -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32ZBC +; RUN: llc -mtriple=riscv64 -mattr=+zbc -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64ZBC +; RUN: llc -mtriple=riscv64 -mattr=+zbkc -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64ZBC + +; Test generic llvm.clmul intrinsic + +declare i8 @llvm.clmul.i8(i8 %a, i8 %b) +declare i16 @llvm.clmul.i16(i16 %a, i16 %b) +declare i32 @llvm.clmul.i32(i32 %a, i32 %b) +declare i64 @llvm.clmul.i64(i64 %a, i64 %b) + +define i8 @clmul_i8(i8 %a, i8 %b) nounwind { +; RV32ZBC-LABEL: clmul_i8: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: clmul a0, a0, a1 +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i8: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: clmul a0, a0, a1 +; RV64ZBC-NEXT: ret + %result = call i8 @llvm.clmul.i8(i8 %a, i8 %b) + ret i8 %result +} + +define i16 @clmul_i16(i16 %a, i16 %b) nounwind { +; RV32ZBC-LABEL: clmul_i16: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: clmul a0, a0, a1 +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i16: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: clmul a0, a0, a1 +; RV64ZBC-NEXT: ret + %result = call i16 @llvm.clmul.i16(i16 %a, i16 %b) + ret i16 %result +} + +define i32 @clmul_i32(i32 %a, i32 %b) nounwind { +; RV32ZBC-LABEL: clmul_i32: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: clmul a0, a0, a1 +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i32: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: clmul a0, a0, a1 +; RV64ZBC-NEXT: ret + %result = call i32 @llvm.clmul.i32(i32 %a, i32 %b) + ret i32 %result +} + +define i64 @clmul_i64_rv32(i64 %a, i64 %b) nounwind { +; RV32ZBC-LABEL: clmul_i64_rv32: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: addi sp, sp, -16 +; RV32ZBC-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32ZBC-NEXT: call __clmuldi3 +; RV32ZBC-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32ZBC-NEXT: addi sp, sp, 16 +; RV32ZBC-NEXT: ret + %result = call i64 @llvm.clmul.i64(i64 %a, i64 %b) + ret i64 %result +} + +define i64 @clmul_i64_rv64(i64 %a, i64 %b) nounwind { +; RV64ZBC-LABEL: clmul_i64_rv64: +; 
RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: clmul a0, a0, a1 +; RV64ZBC-NEXT: ret + %result = call i64 @llvm.clmul.i64(i64 %a, i64 %b) + ret i64 %result +} + +; Test with constants +define i32 @clmul_i32_const() nounwind { +; RV32ZBC-LABEL: clmul_i32_const: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: li a0, 14 +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i32_const: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: li a0, 14 +; RV64ZBC-NEXT: ret + %result = call i32 @llvm.clmul.i32(i32 5, i32 6) + ret i32 %result +} + +; Test with zero operands +define i32 @clmul_i32_zero_left(i32 %b) nounwind { +; RV32ZBC-LABEL: clmul_i32_zero_left: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: li a0, 0 +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i32_zero_left: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: li a0, 0 +; RV64ZBC-NEXT: ret + %result = call i32 @llvm.clmul.i32(i32 0, i32 %b) + ret i32 %result +} + +define i32 @clmul_i32_zero_right(i32 %a) nounwind { +; RV32ZBC-LABEL: clmul_i32_zero_right: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: li a0, 0 +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i32_zero_right: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: li a0, 0 +; RV64ZBC-NEXT: ret + %result = call i32 @llvm.clmul.i32(i32 %a, i32 0) + ret i32 %result +} + +; Test with identity (multiplying by 1) +define i32 @clmul_i32_identity_left(i32 %a) nounwind { +; RV32ZBC-LABEL: clmul_i32_identity_left: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i32_identity_left: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: ret + %result = call i32 @llvm.clmul.i32(i32 1, i32 %a) + ret i32 %result +} + +define i32 @clmul_i32_identity_right(i32 %a) nounwind { +; RV32ZBC-LABEL: clmul_i32_identity_right: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i32_identity_right: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: ret + %result = call i32 @llvm.clmul.i32(i32 %a, i32 1) + ret i32 %result +} + +; Test commutativity +define i32 @clmul_i32_commutative(i32 %a, i32 %b) nounwind { +; RV32ZBC-LABEL: clmul_i32_commutative: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: clmul a0, a0, a1 +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i32_commutative: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: clmul a0, a0, a1 +; RV64ZBC-NEXT: ret + %result1 = call i32 @llvm.clmul.i32(i32 %a, i32 %b) + %result2 = call i32 @llvm.clmul.i32(i32 %b, i32 %a) + %xor = xor i32 %result1, %result2 + ret i32 %xor +} \ No newline at end of file diff --git a/llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll index 51fd086e26dfe..cb190f8ee90c2 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll @@ -15,6 +15,17 @@ define i32 @clmul32(i32 %a, i32 %b) nounwind { ret i32 %tmp } +declare i32 @llvm.clmul.i32(i32 %a, i32 %b) + +define i32 @generic_clmul32(i32 %a, i32 %b) nounwind { +; RV32ZBC-ZBKC-LABEL: generic_clmul32: +; RV32ZBC-ZBKC: # %bb.0: +; RV32ZBC-ZBKC-NEXT: clmul a0, a0, a1 +; RV32ZBC-ZBKC-NEXT: ret + %tmp = call i32 @llvm.clmul.i32(i32 %a, i32 %b) + ret i32 %tmp +} + declare i32 @llvm.riscv.clmulh.i32(i32 %a, i32 %b) define i32 @clmul32h(i32 %a, i32 %b) nounwind { diff --git a/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll index aa9e89bc20953..7a535e93791cb 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll @@ -15,6 +15,17 @@ define i64 @clmul64(i64 %a, i64 %b) nounwind { ret i64 %tmp } +declare i64 @llvm.clmul.i64(i64 %a, i64 %b) + +define i64 
@generic_clmul64(i64 %a, i64 %b) nounwind { +; RV64ZBC-ZBKC-LABEL: generic_clmul64: +; RV64ZBC-ZBKC: # %bb.0: +; RV64ZBC-ZBKC-NEXT: clmul a0, a0, a1 +; RV64ZBC-ZBKC-NEXT: ret + %tmp = call i64 @llvm.clmul.i64(i64 %a, i64 %b) + ret i64 %tmp +} + declare i64 @llvm.riscv.clmulh.i64(i64 %a, i64 %b) define i64 @clmul64h(i64 %a, i64 %b) nounwind { @@ -37,6 +48,17 @@ define signext i32 @clmul32(i32 signext %a, i32 signext %b) nounwind { %tmp = call i32 @llvm.riscv.clmul.i32(i32 %a, i32 %b) ret i32 %tmp } +declare i32 @llvm.clmul.i32(i32 %a, i32 %b) + +define signext i32 @generic_clmul32(i32 signext %a, i32 signext %b) nounwind { +; RV64ZBC-ZBKC-LABEL: generic_clmul32: +; RV64ZBC-ZBKC: # %bb.0: +; RV64ZBC-ZBKC-NEXT: clmul a0, a0, a1 +; RV64ZBC-ZBKC-NEXT: sext.w a0, a0 +; RV64ZBC-ZBKC-NEXT: ret + %tmp = call i32 @llvm.clmul.i32(i32 %a, i32 %b) + ret i32 %tmp +} declare i32 @llvm.riscv.clmulh.i32(i32 %a, i32 %b) diff --git a/llvm/test/Transforms/InstCombine/clmul.ll b/llvm/test/Transforms/InstCombine/clmul.ll new file mode 100644 index 0000000000000..cdfe0e93d1e31 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/clmul.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +declare i8 @llvm.clmul.i8(i8 %a, i8 %b) +declare i16 @llvm.clmul.i16(i16 %a, i16 %b) +declare i32 @llvm.clmul.i32(i32 %a, i32 %b) +declare i64 @llvm.clmul.i64(i64 %a, i64 %b) +declare <2 x i32> @llvm.clmul.v2i32(<2 x i32> %a, <2 x i32> %b) +declare <4 x i32> @llvm.clmul.v4i32(<4 x i32> %a, <4 x i32> %b) + +; Test constant folding + +define i8 @clmul_i8_const() { +; CHECK-LABEL: @clmul_i8_const( +; CHECK-NEXT: ret i8 14 +; + %result = call i8 @llvm.clmul.i8(i8 5, i8 6) + ret i8 %result +} + +define i16 @clmul_i16_const() { +; CHECK-LABEL: @clmul_i16_const( +; CHECK-NEXT: ret i16 158 +; + ; clmul(15, 13) = clmul(0b1111, 0b1101) = 0b10011110 = 158 + %result = call i16 @llvm.clmul.i16(i16 15, i16 13) + ret i16 %result +} + +define i32 @clmul_i32_const() { +; CHECK-LABEL: @clmul_i32_const( +; CHECK-NEXT: ret i32 14 +; + %result = call i32 @llvm.clmul.i32(i32 5, i32 6) + ret i32 %result +} + +define i64 @clmul_i64_const() { +; CHECK-LABEL: @clmul_i64_const( +; CHECK-NEXT: ret i64 398 +; + ; clmul(31, 17) = clmul(0b11111, 0b10001) = 0b110001110 = 398 + %result = call i64 @llvm.clmul.i64(i64 31, i64 17) + ret i64 %result +} + +; Test zero operands + +define i32 @clmul_zero_left(i32 %a) { +; CHECK-LABEL: @clmul_zero_left( +; CHECK-NEXT: ret i32 0 +; + %result = call i32 @llvm.clmul.i32(i32 0, i32 %a) + ret i32 %result +} + +define i32 @clmul_zero_right(i32 %a) { +; CHECK-LABEL: @clmul_zero_right( +; CHECK-NEXT: ret i32 0 +; + %result = call i32 @llvm.clmul.i32(i32 %a, i32 0) + ret i32 %result +} + +; Test identity (multiply by 1) + +define i32 @clmul_identity_left(i32 %a) { +; CHECK-LABEL: @clmul_identity_left( +; CHECK-NEXT: ret i32 [[A:%.*]] +; + %result = call i32 @llvm.clmul.i32(i32 1, i32 %a) + ret i32 %result +} + +define i32 @clmul_identity_right(i32 %a) { +; CHECK-LABEL: @clmul_identity_right( +; CHECK-NEXT: ret i32 [[A:%.*]] +; + %result = call i32 @llvm.clmul.i32(i32 %a, i32 1) + ret i32 %result +} + +; Test with vector constants + +define <2 x i32> @clmul_v2i32_const() { +; CHECK-LABEL: @clmul_v2i32_const( +; CHECK-NEXT: ret <2 x i32> +; + %result = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> , <2 x i32> ) + ret <2 x i32> %result +} + +define <2 x i32> @clmul_v2i32_zero_left(<2 x i32> %a) { +; CHECK-LABEL: @clmul_v2i32_zero_left( 
+; CHECK-NEXT: ret <2 x i32> zeroinitializer +; + %result = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> zeroinitializer, <2 x i32> %a) + ret <2 x i32> %result +} + +define <2 x i32> @clmul_v2i32_zero_right(<2 x i32> %a) { +; CHECK-LABEL: @clmul_v2i32_zero_right( +; CHECK-NEXT: ret <2 x i32> zeroinitializer +; + %result = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> %a, <2 x i32> zeroinitializer) + ret <2 x i32> %result +} + +define <2 x i32> @clmul_v2i32_identity_left(<2 x i32> %a) { +; CHECK-LABEL: @clmul_v2i32_identity_left( +; CHECK-NEXT: ret <2 x i32> [[A:%.*]] +; + %result = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> , <2 x i32> %a) + ret <2 x i32> %result +} + +define <2 x i32> @clmul_v2i32_identity_right(<2 x i32> %a) { +; CHECK-LABEL: @clmul_v2i32_identity_right( +; CHECK-NEXT: ret <2 x i32> [[A:%.*]] +; + %result = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> %a, <2 x i32> ) + ret <2 x i32> %result +} + +; Test commutativity - this should be optimized to 0 + +define i32 @clmul_commutative_cancel(i32 %a, i32 %b) { +; CHECK-LABEL: @clmul_commutative_cancel( +; CHECK-NEXT: ret i32 0 +; + %result1 = call i32 @llvm.clmul.i32(i32 %a, i32 %b) + %result2 = call i32 @llvm.clmul.i32(i32 %b, i32 %a) + %xor = xor i32 %result1, %result2 + ret i32 %xor +} + +; Test partial evaluation with mixed constants and variables + +define i32 @clmul_partial_const(i32 %a) { +; CHECK-LABEL: @clmul_partial_const( +; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.clmul.i32(i32 [[A:%.*]], i32 5) +; CHECK-NEXT: ret i32 [[RESULT]] +; + %result = call i32 @llvm.clmul.i32(i32 %a, i32 5) + ret i32 %result +} + +; Test with specific known values for manual verification + +define i8 @clmul_i8_manual_check1() { +; CHECK-LABEL: @clmul_i8_manual_check1( +; CHECK-NEXT: ret i8 2 +; + ; clmul(1, 2) = clmul(0b1, 0b10) = 0b10 = 2 + %result = call i8 @llvm.clmul.i8(i8 1, i8 2) + ret i8 %result +} + +define i8 @clmul_i8_manual_check2() { +; CHECK-LABEL: @clmul_i8_manual_check2( +; CHECK-NEXT: ret i8 6 +; + ; clmul(2, 3) = clmul(0b10, 0b11) = 0b110 = 6 + %result = call i8 @llvm.clmul.i8(i8 2, i8 3) + ret i8 %result +} + +define i8 @clmul_i8_manual_check3() { +; CHECK-LABEL: @clmul_i8_manual_check3( +; CHECK-NEXT: ret i8 -64 +; + ; clmul(12, 12) = clmul(0b1100, 0b1100) = 0b11000000 = 192 = -64 (i8) + %result = call i8 @llvm.clmul.i8(i8 12, i8 12) + ret i8 %result +} \ No newline at end of file
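
For hand-checking the documented semantics, a minimal stand-alone reference model follows (a sketch, not part of the patch; the helper name clmul_ref and the use of plain C++ are illustrative assumptions). It XORs one shifted copy of the second operand per set bit of the first and truncates to the requested width, which reproduces the i4 examples from the LangRef section above:

    // clmul_ref.cpp - reference model of the llvm.clmul semantics above.
    #include <cstdint>
    #include <cstdio>

    // Carryless multiply of A and B, truncated to Bits bits (0 < Bits < 64).
    static uint64_t clmul_ref(uint64_t A, uint64_t B, unsigned Bits) {
      uint64_t R = 0;
      for (unsigned I = 0; I != Bits; ++I)
        if ((A >> I) & 1)
          R ^= B << I; // XOR in a shifted copy of B instead of adding it.
      return R & ((UINT64_C(1) << Bits) - 1);
    }

    int main() {
      // The i4 examples from the LangRef section, with negative operands
      // given as their 4-bit two's complement encodings.
      printf("%u\n", (unsigned)clmul_ref(1, 2, 4));   // 2
      printf("%u\n", (unsigned)clmul_ref(5, 6, 4));   // 14
      printf("%u\n", (unsigned)clmul_ref(12, 2, 4));  // 8, i.e. i4 -8
      printf("%u\n", (unsigned)clmul_ref(12, 11, 4)); // 4
      return 0;
    }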