diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index eb2ef6bc35742..3a4191a7d2963 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -18155,6 +18155,54 @@ Example:
       %r = call i8 @llvm.fshr.i8(i8 15, i8 15, i8 11)  ; %r = i8: 225 (0b11100001)
       %r = call i8 @llvm.fshr.i8(i8 0, i8 255, i8 8)   ; %r = i8: 255 (0b11111111)
 
+.. _int_clmul:
+
+'``llvm.clmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.clmul``
+on any integer bit width or any vector of integers.
+
+::
+
+      declare i16 @llvm.clmul.i16(i16 %a, i16 %b)
+      declare i32 @llvm.clmul.i32(i32 %a, i32 %b)
+      declare i64 @llvm.clmul.i64(i64 %a, i64 %b)
+      declare <4 x i32> @llvm.clmul.v4i32(<4 x i32> %a, <4 x i32> %b)
+
+Overview
+"""""""""
+
+The '``llvm.clmul``' family of intrinsic functions performs carryless multiplication
+(also known as XOR multiplication) on the two arguments.
+
+Arguments
+""""""""""
+
+The arguments (``%a`` and ``%b``) and the result may be of integer types of any bit
+width, but they must all have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo carryless multiplication.
+
+Semantics:
+""""""""""
+
+The '``llvm.clmul``' intrinsic computes the carryless multiply of ``%a`` and ``%b``,
+which is the result of applying the standard multiplication algorithm with every
+addition replaced by an exclusive or.
+The vector intrinsics, such as ``llvm.clmul.v4i32``, operate on a per-element basis
+and the element order is not affected.
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+      %res = call i4 @llvm.clmul.i4(i4 1, i4 2)    ; %res = 2
+      %res = call i4 @llvm.clmul.i4(i4 5, i4 6)    ; %res = 14
+      %res = call i4 @llvm.clmul.i4(i4 -4, i4 2)   ; %res = -8
+      %res = call i4 @llvm.clmul.i4(i4 -4, i4 -5)  ; %res = 4
+
 Arithmetic with Overflow Intrinsics
 -----------------------------------
 
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 465e4a0a9d0d8..ffb71593af8bf 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -760,6 +760,9 @@ enum NodeType {
   ROTR,
   FSHL,
   FSHR,
+
+  /// Carryless multiplication operator.
+  CLMUL,
 
   /// Byte Swap and Counting operators.
   BSWAP,
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index cbdc1b6031680..b3fcb41400ed0 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5400,6 +5400,11 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
   /// \returns The expansion if successful, SDValue() otherwise
   SDValue expandFunnelShift(SDNode *N, SelectionDAG &DAG) const;
 
+  /// Expand carryless multiply.
+  /// \param N Node to expand
+  /// \returns The expansion if successful, SDValue() otherwise
+  SDValue expandCLMUL(SDNode *N, SelectionDAG &DAG) const;
+
   /// Expand rotations.
/// \param N Node to expand /// \param AllowVectorOps expand vector rotate, this should only be performed diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index bd6f94ac1286c..f5ba9e4cbcd89 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1427,6 +1427,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; def int_fshr : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; + def int_clmul : DefaultAttrsIntrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>]>; } let IntrProperties = [IntrNoMem, IntrSpeculatable, diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index a4ed62bb5715c..c6fe89044a623 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -437,6 +437,8 @@ def sra_parts : SDNode<"ISD::SRA_PARTS" , SDTIntShiftPairOp>; def srl_parts : SDNode<"ISD::SRL_PARTS" , SDTIntShiftPairOp>; def fshl : SDNode<"ISD::FSHL" , SDTIntShiftDOp>; def fshr : SDNode<"ISD::FSHR" , SDTIntShiftDOp>; +def clmul : SDNode<"ISD::CLMUL" , SDTIntBinOp, + [SDNPCommutative, SDNPAssociative]>; def and : SDNode<"ISD::AND" , SDTIntBinOp, [SDNPCommutative, SDNPAssociative]>; def or : SDNode<"ISD::OR" , SDTIntBinOp, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index ba0ab2383d87a..597ca0e158032 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4017,6 +4017,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { if (SDValue Expanded = TLI.expandFunnelShift(Node, DAG)) Results.push_back(Expanded); break; + case ISD::CLMUL: + Results.push_back(TLI.expandCLMUL(Node, DAG)); + break; case ISD::ROTL: case ISD::ROTR: if (SDValue Expanded = TLI.expandROT(Node, true /*AllowVectorOps*/, DAG)) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 583a85a332dcd..51ef777ca6ecc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -209,7 +209,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::VP_XOR: case ISD::VP_ADD: case ISD::VP_SUB: - case ISD::VP_MUL: Res = PromoteIntRes_SimpleIntBinOp(N); break; + case ISD::VP_MUL: + case ISD::CLMUL: Res = PromoteIntRes_SimpleIntBinOp(N); break; case ISD::ABDS: case ISD::AVGCEILS: @@ -3140,6 +3141,10 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { ExpandIntRes_FunnelShift(N, Lo, Hi); break; + case ISD::CLMUL: + ExpandIntRes_CLMUL(N, Lo, Hi); + break; + case ISD::VSCALE: ExpandIntRes_VSCALE(N, Lo, Hi); break; @@ -5476,6 +5481,37 @@ void DAGTypeLegalizer::ExpandIntRes_FunnelShift(SDNode *N, SDValue &Lo, Hi = DAG.getNode(Opc, DL, HalfVT, Select3, Select2, NewShAmt); } +void DAGTypeLegalizer::ExpandIntRes_CLMUL(SDNode *N, SDValue &Lo, + SDValue &Hi) { + // Values numbered from least significant to most significant. 
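+  // Splitting A = AH * 2^N + AL and B = BH * 2^N + BL (N = half width), the
+  // 2N-bit carryless product is
+  //   CLMUL(AL, BL) ^ (CLMUL(AL, BH) << N) ^ (CLMUL(AH, BL) << N)
+  // because the CLMUL(AH, BH) term only feeds bits >= 2N. Computing the cross
+  // products at half width keeps exactly the low N bits that land in Hi.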
+  SDValue LL, LH, RL, RH;
+  GetExpandedInteger(N->getOperand(0), LL, LH);
+  GetExpandedInteger(N->getOperand(1), RL, RH);
+  EVT HalfVT = LL.getValueType();
+  SDLoc DL(N);
+
+  // The low half of the result is the carryless product of the low halves.
+  Lo = DAG.getNode(ISD::CLMUL, DL, HalfVT, LL, RL);
+  // CLMUL is carryless, so the high bits not included in CLMUL(A, B) can be
+  // computed as
+  //   BITREVERSE(CLMUL(BITREVERSE(A), BITREVERSE(B))) >> 1
+  // Therefore we can compute the two hi/lo cross products and the overflow of
+  // the low product, and XOR them together to compute Hi.
+  // TODO: If the target supports a widening CLMUL or a CLMULH, we should
+  // probably use that instead.
+  SDValue BitRevLL = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, LL);
+  SDValue BitRevRL = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, RL);
+  SDValue BitRevLoHi = DAG.getNode(ISD::CLMUL, DL, HalfVT, BitRevLL, BitRevRL);
+  SDValue LoHi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, BitRevLoHi);
+  SDValue One = DAG.getShiftAmountConstant(1, HalfVT, DL);
+  Hi = DAG.getNode(ISD::SRL, DL, HalfVT, LoHi, One);
+
+  SDValue HiTmp = DAG.getNode(ISD::CLMUL, DL, HalfVT, LL, RH);
+  Hi = DAG.getNode(ISD::XOR, DL, HalfVT, Hi, HiTmp);
+  HiTmp = DAG.getNode(ISD::CLMUL, DL, HalfVT, LH, RL);
+  Hi = DAG.getNode(ISD::XOR, DL, HalfVT, Hi, HiTmp);
+}
+
 void DAGTypeLegalizer::ExpandIntRes_VSCALE(SDNode *N, SDValue &Lo,
                                            SDValue &Hi) {
   EVT VT = N->getValueType(0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 2e13b1854bf29..8c9c557771ddb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -511,6 +511,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void ExpandIntRes_Rotate          (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_FunnelShift     (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_CLMUL           (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_VSCALE          (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_READ_REGISTER(SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 1661814d5a897..a2c3c5b74b8de 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -166,6 +166,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SMAX:
   case ISD::UMIN:
   case ISD::UMAX:
+  case ISD::CLMUL:
 
   case ISD::SADDSAT:
   case ISD::UADDSAT:
@@ -1330,6 +1331,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SMAX: case ISD::VP_SMAX:
   case ISD::UMIN: case ISD::VP_UMIN:
   case ISD::UMAX: case ISD::VP_UMAX:
+  case ISD::CLMUL:
   case ISD::SADDSAT: case ISD::VP_SADDSAT:
   case ISD::UADDSAT: case ISD::VP_UADDSAT:
   case ISD::SSUBSAT: case ISD::VP_SSUBSAT:
@@ -4764,6 +4766,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SSUBSAT: case ISD::VP_SSUBSAT:
   case ISD::SSHLSAT:
   case ISD::USHLSAT:
+  case ISD::CLMUL:
   case ISD::ROTL:
   case ISD::ROTR:
   case ISD::AVGFLOORS:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 5c586f73aa125..207c5398f8d61 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7521,6 +7521,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
   case ISD::SSUBSAT:
   case ISD::UADDSAT:
   case ISD::USUBSAT:
+  case ISD::CLMUL:
     assert(VT.isInteger()
&& "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 306e068f1c1da..f37191dfcc857 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7228,6 +7228,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } return; } + case Intrinsic::clmul: { + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + setValue(&I, DAG.getNode(ISD::CLMUL, sdl, Op1.getValueType(), Op1, Op2)); + return; + } case Intrinsic::sadd_sat: { SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 900da7645504f..68550807e731b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -299,6 +299,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::ROTR: return "rotr"; case ISD::FSHL: return "fshl"; case ISD::FSHR: return "fshr"; + case ISD::CLMUL: return "clmul"; case ISD::FADD: return "fadd"; case ISD::STRICT_FADD: return "strict_fadd"; case ISD::FSUB: return "fsub"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 1764910861df4..f2106f43e443d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8277,6 +8277,44 @@ SDValue TargetLowering::expandFunnelShift(SDNode *Node, return DAG.getNode(ISD::OR, DL, VT, ShX, ShY); } +SDValue TargetLowering::expandCLMUL(SDNode *Node, + SelectionDAG &DAG) const { + SDLoc DL(Node); + EVT VT = Node->getValueType(0); + SDValue V1 = Node->getOperand(0); + SDValue V2 = Node->getOperand(1); + unsigned NumBitsPerElt = VT.getScalarSizeInBits(); + + EVT SetCCType = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + // Only expand vector types if we have the appropriate vector bit operations. + // FIXME: Should really try to split the vector in case it's legal on a + // subvector. + if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) || + (!isOperationLegalOrCustom(ISD::SRL, VT) || + !isOperationLegalOrCustom(ISD::SHL, VT) || + !isOperationLegalOrCustom(ISD::XOR, VT) || + !isOperationLegalOrCustom(ISD::AND, VT) || + !isOperationLegalOrCustom(ISD::SELECT, VT)))) + return DAG.UnrollVectorOp(Node); + + SDValue Res = DAG.getConstant(0, DL, VT); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + SDValue OneForShift = DAG.getShiftAmountConstant(1, VT, DL); + for (unsigned I = 0; I < NumBitsPerElt; ++I) { + SDValue LowBit = DAG.getNode(ISD::AND, DL, VT, V1, One); + SDValue LowBool = DAG.getSetCC(DL, SetCCType, LowBit, Zero, ISD::SETNE); + SDValue Pred = DAG.getNode(ISD::SELECT, DL, VT, LowBool, V2, Zero); + Res = DAG.getNode(ISD::XOR, DL, VT, Res, Pred); + if (I != NumBitsPerElt - 1) { + V1 = DAG.getNode(ISD::SRL, DL, VT, V1, OneForShift); + V2 = DAG.getNode(ISD::SHL, DL, VT, V2, OneForShift); + } + } + return Res; +} + // TODO: Merge with expandFunnelShift. 
SDValue TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps, SelectionDAG &DAG) const { diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 3c91b0eb4e2ea..725fa9cacdd04 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -844,6 +844,9 @@ void TargetLoweringBase::initActions() { // Absolute difference setOperationAction({ISD::ABDS, ISD::ABDU}, VT, Expand); + // Carryless multiply + setOperationAction(ISD::CLMUL, VT, Expand); + // Saturated trunc setOperationAction(ISD::TRUNCATE_SSAT_S, VT, Expand); setOperationAction(ISD::TRUNCATE_SSAT_U, VT, Expand); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index dbe992208c9f3..9af4f3adbffa9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -402,6 +402,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, Legal); } + if (Subtarget.hasStdExtZbc() || Subtarget.hasStdExtZbkc()) { + setOperationAction(ISD::CLMUL, XLenVT, Legal); + } + if (Subtarget.hasStdExtZbb() || (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit())) { if (Subtarget.is64Bit()) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index d2a651444169c..666ae032228aa 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -799,6 +799,7 @@ def : Sh3AddPat; } // Predicates = [HasStdExtZba, IsRV64] let Predicates = [HasStdExtZbcOrZbkc] in { +def : PatGprGpr; def : PatGprGpr; def : PatGprGpr; } // Predicates = [HasStdExtZbcOrZbkc] diff --git a/llvm/test/CodeGen/Generic/clmul-expand.ll b/llvm/test/CodeGen/Generic/clmul-expand.ll new file mode 100644 index 0000000000000..21aeaf9195a1f --- /dev/null +++ b/llvm/test/CodeGen/Generic/clmul-expand.ll @@ -0,0 +1,190 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=-zbc,-zbkc -verify-machineinstrs | FileCheck %s --check-prefix=RV32-EXPAND +; RUN: llc < %s -mtriple=riscv64 -mattr=-zbc,-zbkc -verify-machineinstrs | FileCheck %s --check-prefix=RV64-EXPAND +; RUN: llc < %s -mtriple=x86_64 -verify-machineinstrs | FileCheck %s --check-prefix=X64-EXPAND + +; Test CLMUL expansion when the instruction is not natively supported + +declare i8 @llvm.clmul.i8(i8 %a, i8 %b) +declare i16 @llvm.clmul.i16(i16 %a, i16 %b) +declare i32 @llvm.clmul.i32(i32 %a, i32 %b) +declare i64 @llvm.clmul.i64(i64 %a, i64 %b) +declare i128 @llvm.clmul.i128(i128 %a, i128 %b) + +define i8 @clmul_expand_i8(i8 %a, i8 %b) nounwind { +; RV32-EXPAND-LABEL: clmul_expand_i8: +; RV32-EXPAND: # %bb.0: +; RV32-EXPAND-NEXT: andi a2, a0, 1 +; RV32-EXPAND-NEXT: beqz a2, .LBB0_2 +; RV32-EXPAND-NEXT: # %bb.1: +; RV32-EXPAND-NEXT: mv a2, a1 +; RV32-EXPAND-NEXT: j .LBB0_3 +; RV32-EXPAND-NEXT: .LBB0_2: +; RV32-EXPAND-NEXT: li a2, 0 +; RV32-EXPAND-NEXT: .LBB0_3: +; RV32-EXPAND-NEXT: srli a0, a0, 1 +; RV32-EXPAND-NEXT: slli a1, a1, 1 +; RV32-EXPAND-NEXT: andi a3, a0, 1 +; RV32-EXPAND-NEXT: beqz a3, .LBB0_5 +; RV32-EXPAND-NEXT: # %bb.4: +; RV32-EXPAND-NEXT: xor a2, a2, a1 +; RV32-EXPAND-NEXT: .LBB0_5: +; RV32-EXPAND-NEXT: srli a0, a0, 1 +; RV32-EXPAND-NEXT: slli a1, a1, 1 +; RV32-EXPAND-NEXT: andi a3, a0, 1 +; RV32-EXPAND-NEXT: beqz a3, .LBB0_7 +; RV32-EXPAND-NEXT: # %bb.6: +; RV32-EXPAND-NEXT: xor a2, a2, a1 +; RV32-EXPAND-NEXT: .LBB0_7: +; RV32-EXPAND-NEXT: srli a0, a0, 1 +; RV32-EXPAND-NEXT: slli 
a1, a1, 1 +; RV32-EXPAND-NEXT: andi a3, a0, 1 +; RV32-EXPAND-NEXT: beqz a3, .LBB0_9 +; RV32-EXPAND-NEXT: # %bb.8: +; RV32-EXPAND-NEXT: xor a2, a2, a1 +; RV32-EXPAND-NEXT: .LBB0_9: +; RV32-EXPAND-NEXT: srli a0, a0, 1 +; RV32-EXPAND-NEXT: slli a1, a1, 1 +; RV32-EXPAND-NEXT: andi a3, a0, 1 +; RV32-EXPAND-NEXT: beqz a3, .LBB0_11 +; RV32-EXPAND-NEXT: # %bb.10: +; RV32-EXPAND-NEXT: xor a2, a2, a1 +; RV32-EXPAND-NEXT: .LBB0_11: +; RV32-EXPAND-NEXT: srli a0, a0, 1 +; RV32-EXPAND-NEXT: slli a1, a1, 1 +; RV32-EXPAND-NEXT: andi a3, a0, 1 +; RV32-EXPAND-NEXT: beqz a3, .LBB0_13 +; RV32-EXPAND-NEXT: # %bb.12: +; RV32-EXPAND-NEXT: xor a2, a2, a1 +; RV32-EXPAND-NEXT: .LBB0_13: +; RV32-EXPAND-NEXT: srli a0, a0, 1 +; RV32-EXPAND-NEXT: slli a1, a1, 1 +; RV32-EXPAND-NEXT: andi a3, a0, 1 +; RV32-EXPAND-NEXT: beqz a3, .LBB0_15 +; RV32-EXPAND-NEXT: # %bb.14: +; RV32-EXPAND-NEXT: xor a2, a2, a1 +; RV32-EXPAND-NEXT: .LBB0_15: +; RV32-EXPAND-NEXT: andi a0, a0, 2 +; RV32-EXPAND-NEXT: beqz a0, .LBB0_17 +; RV32-EXPAND-NEXT: # %bb.16: +; RV32-EXPAND-NEXT: slli a0, a1, 1 +; RV32-EXPAND-NEXT: xor a2, a2, a0 +; RV32-EXPAND-NEXT: .LBB0_17: +; RV32-EXPAND-NEXT: mv a0, a2 +; RV32-EXPAND-NEXT: ret + %result = call i8 @llvm.clmul.i8(i8 %a, i8 %b) + ret i8 %result +} + +define i16 @clmul_expand_i16(i16 %a, i16 %b) nounwind { +; RV32-EXPAND-LABEL: clmul_expand_i16: +; RV32-EXPAND: # %bb.0: +; RV32-EXPAND-NEXT: andi a2, a0, 1 +; RV32-EXPAND-NEXT: beqz a2, .LBB1_2 +; RV32-EXPAND-NEXT: # %bb.1: +; RV32-EXPAND-NEXT: mv a2, a1 +; RV32-EXPAND-NEXT: j .LBB1_3 +; RV32-EXPAND-NEXT: .LBB1_2: +; RV32-EXPAND-NEXT: li a2, 0 +; RV32-EXPAND-NEXT: .LBB1_3: +; RV32-EXPAND-NEXT: srli a0, a0, 1 +; RV32-EXPAND-NEXT: slli a1, a1, 1 + %result = call i16 @llvm.clmul.i16(i16 %a, i16 %b) + ret i16 %result +} + +define i32 @clmul_expand_i32(i32 %a, i32 %b) nounwind { +; RV32-EXPAND-LABEL: clmul_expand_i32: +; RV32-EXPAND: # %bb.0: +; RV32-EXPAND-NEXT: andi a2, a0, 1 +; RV32-EXPAND-NEXT: beqz a2, .LBB2_2 +; RV32-EXPAND-NEXT: # %bb.1: +; RV32-EXPAND-NEXT: mv a2, a1 +; RV32-EXPAND-NEXT: j .LBB2_3 +; RV32-EXPAND-NEXT: .LBB2_2: +; RV32-EXPAND-NEXT: li a2, 0 +; RV32-EXPAND-NEXT: .LBB2_3: +; RV32-EXPAND-NEXT: srli a0, a0, 1 +; RV32-EXPAND-NEXT: slli a1, a1, 1 + %result = call i32 @llvm.clmul.i32(i32 %a, i32 %b) + ret i32 %result +} + +define i64 @clmul_expand_i64(i64 %a, i64 %b) nounwind { +; RV32-EXPAND-LABEL: clmul_expand_i64: +; RV32-EXPAND: # %bb.0: +; RV32-EXPAND-NEXT: addi sp, sp, -16 +; RV32-EXPAND-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-EXPAND-NEXT: call __clmuldi3 +; RV32-EXPAND-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-EXPAND-NEXT: addi sp, sp, 16 +; RV32-EXPAND-NEXT: ret +; +; RV64-EXPAND-LABEL: clmul_expand_i64: +; RV64-EXPAND: # %bb.0: +; RV64-EXPAND-NEXT: andi a2, a0, 1 +; RV64-EXPAND-NEXT: beqz a2, .LBB3_2 +; RV64-EXPAND-NEXT: # %bb.1: +; RV64-EXPAND-NEXT: mv a2, a1 +; RV64-EXPAND-NEXT: j .LBB3_3 +; RV64-EXPAND-NEXT: .LBB3_2: +; RV64-EXPAND-NEXT: li a2, 0 +; RV64-EXPAND-NEXT: .LBB3_3: +; RV64-EXPAND-NEXT: srli a0, a0, 1 +; RV64-EXPAND-NEXT: slli a1, a1, 1 + %result = call i64 @llvm.clmul.i64(i64 %a, i64 %b) + ret i64 %result +} + +define i128 @clmul_expand_i128(i128 %a, i128 %b) nounwind { +; RV32-EXPAND-LABEL: clmul_expand_i128: +; RV32-EXPAND: # %bb.0: +; RV32-EXPAND-NEXT: addi sp, sp, -16 +; RV32-EXPAND-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-EXPAND-NEXT: call __clmulti3 +; RV32-EXPAND-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-EXPAND-NEXT: addi sp, sp, 16 +; RV32-EXPAND-NEXT: ret +; +; RV64-EXPAND-LABEL: 
clmul_expand_i128: +; RV64-EXPAND: # %bb.0: +; RV64-EXPAND-NEXT: addi sp, sp, -16 +; RV64-EXPAND-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-EXPAND-NEXT: call __clmulti3 +; RV64-EXPAND-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-EXPAND-NEXT: addi sp, sp, 16 +; RV64-EXPAND-NEXT: ret + %result = call i128 @llvm.clmul.i128(i128 %a, i128 %b) + ret i128 %result +} + +; Test with known constants to verify correctness +define i8 @clmul_expand_i8_known() nounwind { +; RV32-EXPAND-LABEL: clmul_expand_i8_known: +; RV32-EXPAND: # %bb.0: +; RV32-EXPAND-NEXT: li a0, 14 +; RV32-EXPAND-NEXT: ret +; +; RV64-EXPAND-LABEL: clmul_expand_i8_known: +; RV64-EXPAND: # %bb.0: +; RV64-EXPAND-NEXT: li a0, 14 +; RV64-EXPAND-NEXT: ret + ; clmul(5, 6) = clmul(0b101, 0b110) = 0b1110 = 14 + %result = call i8 @llvm.clmul.i8(i8 5, i8 6) + ret i8 %result +} + +define i16 @clmul_expand_i16_known() nounwind { +; RV32-EXPAND-LABEL: clmul_expand_i16_known: +; RV32-EXPAND: # %bb.0: +; RV32-EXPAND-NEXT: li a0, 158 +; RV32-EXPAND-NEXT: ret +; +; RV64-EXPAND-LABEL: clmul_expand_i16_known: +; RV64-EXPAND: # %bb.0: +; RV64-EXPAND-NEXT: li a0, 158 +; RV64-EXPAND-NEXT: ret + ; clmul(15, 13) = clmul(0b1111, 0b1101) = 0b10011110 = 158 + %result = call i16 @llvm.clmul.i16(i16 15, i16 13) + ret i16 %result +} \ No newline at end of file diff --git a/llvm/test/CodeGen/Generic/clmul-vector.ll b/llvm/test/CodeGen/Generic/clmul-vector.ll new file mode 100644 index 0000000000000..2bcf77f32f68c --- /dev/null +++ b/llvm/test/CodeGen/Generic/clmul-vector.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=-zbc,-zbkc -verify-machineinstrs | FileCheck %s --check-prefix=RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=-zbc,-zbkc -verify-machineinstrs | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=x86_64 -verify-machineinstrs | FileCheck %s --check-prefix=X64 + +; Test CLMUL with vector types + +declare <2 x i32> @llvm.clmul.v2i32(<2 x i32> %a, <2 x i32> %b) +declare <4 x i32> @llvm.clmul.v4i32(<4 x i32> %a, <4 x i32> %b) +declare <2 x i64> @llvm.clmul.v2i64(<2 x i64> %a, <2 x i64> %b) +declare <8 x i16> @llvm.clmul.v8i16(<8 x i16> %a, <8 x i16> %b) +declare <16 x i8> @llvm.clmul.v16i8(<16 x i8> %a, <16 x i8> %b) + +define <2 x i32> @clmul_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { +; RV32-LABEL: clmul_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s0, a3 +; RV32-NEXT: mv s1, a2 +; RV32-NEXT: mv s2, a1 +; RV32-NEXT: mv s3, a0 +; RV32-NEXT: andi a0, a3, 1 +; RV32-NEXT: beqz a0, .LBB0_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv s4, s1 +; RV32-NEXT: j .LBB0_3 +; RV32-NEXT: .LBB0_2: +; RV32-NEXT: li s4, 0 +; RV32-NEXT: .LBB0_3: +; RV32-NEXT: srli s0, s0, 1 +; RV32-NEXT: slli s1, s1, 1 + %result = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> %a, <2 x i32> %b) + ret <2 x i32> %result +} + +define <4 x i32> @clmul_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; RV32-LABEL: clmul_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 68(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 64(sp) # 4-byte Folded Spill 
+; RV32-NEXT: sw s3, 60(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 48(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s9, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s10, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s11, 28(sp) # 4-byte Folded Spill + %result = call <4 x i32> @llvm.clmul.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +define <2 x i64> @clmul_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { +; RV32-LABEL: clmul_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: lw s0, 64(sp) +; RV32-NEXT: lw s1, 68(sp) +; RV32-NEXT: lw s2, 72(sp) +; RV32-NEXT: lw s3, 76(sp) +; RV32-NEXT: mv s4, a0 +; RV32-NEXT: mv s5, a1 +; RV32-NEXT: mv s6, a2 +; RV32-NEXT: mv s7, a3 +; RV32-NEXT: mv a0, s6 +; RV32-NEXT: mv a1, s7 +; RV32-NEXT: mv a2, s2 +; RV32-NEXT: mv a3, s3 +; RV32-NEXT: call __clmuldi3 +; RV32-NEXT: mv s8, a0 +; RV32-NEXT: mv s2, a1 +; RV32-NEXT: mv a0, s4 +; RV32-NEXT: mv a1, s5 +; RV32-NEXT: mv a2, s0 +; RV32-NEXT: mv a3, s1 +; RV32-NEXT: call __clmuldi3 +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: mv a3, a1 +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a1, a3 +; RV32-NEXT: mv a2, s8 +; RV32-NEXT: mv a3, s2 +; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s8, 8(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: ret + %result = call <2 x i64> @llvm.clmul.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %result +} + +define <8 x i16> @clmul_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { +; RV32-LABEL: clmul_v8i16: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -144 +; RV32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill + %result = call <8 x i16> @llvm.clmul.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %result +} + +define <16 x i8> @clmul_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; RV32-LABEL: clmul_v16i8: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -272 +; RV32-NEXT: sw ra, 268(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 264(sp) # 4-byte Folded Spill + %result = call <16 x i8> @llvm.clmul.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %result +} + +; Test with splat vectors +define <2 x i32> @clmul_v2i32_splat(<2 x i32> %a) nounwind { +; RV32-LABEL: clmul_v2i32_splat: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw 
s3, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s0, a1 +; RV32-NEXT: mv s1, a0 +; RV32-NEXT: andi a0, a1, 1 +; RV32-NEXT: li s2, 3 +; RV32-NEXT: beqz a0, .LBB5_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv s3, s2 +; RV32-NEXT: j .LBB5_3 +; RV32-NEXT: .LBB5_2: +; RV32-NEXT: li s3, 0 +; RV32-NEXT: .LBB5_3: +; RV32-NEXT: srli s0, s0, 1 +; RV32-NEXT: slli s2, s2, 1 + %splat = insertelement <2 x i32> poison, i32 3, i32 0 + %splat_vec = shufflevector <2 x i32> %splat, <2 x i32> poison, <2 x i32> zeroinitializer + %result = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> %a, <2 x i32> %splat_vec) + ret <2 x i32> %result +} + +; Test with constant vectors +define <2 x i32> @clmul_v2i32_const() nounwind { +; RV32-LABEL: clmul_v2i32_const: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 14 +; RV32-NEXT: li a1, 6 +; RV32-NEXT: ret + %result = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> , <2 x i32> ) + ret <2 x i32> %result +} \ No newline at end of file diff --git a/llvm/test/CodeGen/RISCV/clmul-intrinsic.ll b/llvm/test/CodeGen/RISCV/clmul-intrinsic.ll new file mode 100644 index 0000000000000..291509c343efe --- /dev/null +++ b/llvm/test/CodeGen/RISCV/clmul-intrinsic.ll @@ -0,0 +1,166 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+zbc -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32ZBC +; RUN: llc -mtriple=riscv32 -mattr=+zbkc -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32ZBC +; RUN: llc -mtriple=riscv64 -mattr=+zbc -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64ZBC +; RUN: llc -mtriple=riscv64 -mattr=+zbkc -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64ZBC + +; Test generic llvm.clmul intrinsic + +declare i8 @llvm.clmul.i8(i8 %a, i8 %b) +declare i16 @llvm.clmul.i16(i16 %a, i16 %b) +declare i32 @llvm.clmul.i32(i32 %a, i32 %b) +declare i64 @llvm.clmul.i64(i64 %a, i64 %b) + +define i8 @clmul_i8(i8 %a, i8 %b) nounwind { +; RV32ZBC-LABEL: clmul_i8: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: clmul a0, a0, a1 +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i8: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: clmul a0, a0, a1 +; RV64ZBC-NEXT: ret + %result = call i8 @llvm.clmul.i8(i8 %a, i8 %b) + ret i8 %result +} + +define i16 @clmul_i16(i16 %a, i16 %b) nounwind { +; RV32ZBC-LABEL: clmul_i16: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: clmul a0, a0, a1 +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i16: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: clmul a0, a0, a1 +; RV64ZBC-NEXT: ret + %result = call i16 @llvm.clmul.i16(i16 %a, i16 %b) + ret i16 %result +} + +define i32 @clmul_i32(i32 %a, i32 %b) nounwind { +; RV32ZBC-LABEL: clmul_i32: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: clmul a0, a0, a1 +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i32: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: clmul a0, a0, a1 +; RV64ZBC-NEXT: ret + %result = call i32 @llvm.clmul.i32(i32 %a, i32 %b) + ret i32 %result +} + +define i64 @clmul_i64_rv32(i64 %a, i64 %b) nounwind { +; RV32ZBC-LABEL: clmul_i64_rv32: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: addi sp, sp, -16 +; RV32ZBC-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32ZBC-NEXT: call __clmuldi3 +; RV32ZBC-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32ZBC-NEXT: addi sp, sp, 16 +; RV32ZBC-NEXT: ret + %result = call i64 @llvm.clmul.i64(i64 %a, i64 %b) + ret i64 %result +} + +define i64 @clmul_i64_rv64(i64 %a, i64 %b) nounwind { +; RV64ZBC-LABEL: clmul_i64_rv64: +; 
RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: clmul a0, a0, a1 +; RV64ZBC-NEXT: ret + %result = call i64 @llvm.clmul.i64(i64 %a, i64 %b) + ret i64 %result +} + +; Test with constants +define i32 @clmul_i32_const() nounwind { +; RV32ZBC-LABEL: clmul_i32_const: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: li a0, 14 +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i32_const: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: li a0, 14 +; RV64ZBC-NEXT: ret + %result = call i32 @llvm.clmul.i32(i32 5, i32 6) + ret i32 %result +} + +; Test with zero operands +define i32 @clmul_i32_zero_left(i32 %b) nounwind { +; RV32ZBC-LABEL: clmul_i32_zero_left: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: li a0, 0 +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i32_zero_left: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: li a0, 0 +; RV64ZBC-NEXT: ret + %result = call i32 @llvm.clmul.i32(i32 0, i32 %b) + ret i32 %result +} + +define i32 @clmul_i32_zero_right(i32 %a) nounwind { +; RV32ZBC-LABEL: clmul_i32_zero_right: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: li a0, 0 +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i32_zero_right: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: li a0, 0 +; RV64ZBC-NEXT: ret + %result = call i32 @llvm.clmul.i32(i32 %a, i32 0) + ret i32 %result +} + +; Test with identity (multiplying by 1) +define i32 @clmul_i32_identity_left(i32 %a) nounwind { +; RV32ZBC-LABEL: clmul_i32_identity_left: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i32_identity_left: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: ret + %result = call i32 @llvm.clmul.i32(i32 1, i32 %a) + ret i32 %result +} + +define i32 @clmul_i32_identity_right(i32 %a) nounwind { +; RV32ZBC-LABEL: clmul_i32_identity_right: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i32_identity_right: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: ret + %result = call i32 @llvm.clmul.i32(i32 %a, i32 1) + ret i32 %result +} + +; Test commutativity +define i32 @clmul_i32_commutative(i32 %a, i32 %b) nounwind { +; RV32ZBC-LABEL: clmul_i32_commutative: +; RV32ZBC: # %bb.0: +; RV32ZBC-NEXT: clmul a0, a0, a1 +; RV32ZBC-NEXT: ret +; +; RV64ZBC-LABEL: clmul_i32_commutative: +; RV64ZBC: # %bb.0: +; RV64ZBC-NEXT: clmul a0, a0, a1 +; RV64ZBC-NEXT: ret + %result1 = call i32 @llvm.clmul.i32(i32 %a, i32 %b) + %result2 = call i32 @llvm.clmul.i32(i32 %b, i32 %a) + %xor = xor i32 %result1, %result2 + ret i32 %xor +} \ No newline at end of file diff --git a/llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll index 51fd086e26dfe..cb190f8ee90c2 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll @@ -15,6 +15,17 @@ define i32 @clmul32(i32 %a, i32 %b) nounwind { ret i32 %tmp } +declare i32 @llvm.clmul.i32(i32 %a, i32 %b) + +define i32 @generic_clmul32(i32 %a, i32 %b) nounwind { +; RV32ZBC-ZBKC-LABEL: generic_clmul32: +; RV32ZBC-ZBKC: # %bb.0: +; RV32ZBC-ZBKC-NEXT: clmul a0, a0, a1 +; RV32ZBC-ZBKC-NEXT: ret + %tmp = call i32 @llvm.clmul.i32(i32 %a, i32 %b) + ret i32 %tmp +} + declare i32 @llvm.riscv.clmulh.i32(i32 %a, i32 %b) define i32 @clmul32h(i32 %a, i32 %b) nounwind { diff --git a/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll index aa9e89bc20953..7a535e93791cb 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll @@ -15,6 +15,17 @@ define i64 @clmul64(i64 %a, i64 %b) nounwind { ret i64 %tmp } +declare i64 @llvm.clmul.i64(i64 %a, i64 %b) + +define i64 
@generic_clmul64(i64 %a, i64 %b) nounwind { +; RV64ZBC-ZBKC-LABEL: generic_clmul64: +; RV64ZBC-ZBKC: # %bb.0: +; RV64ZBC-ZBKC-NEXT: clmul a0, a0, a1 +; RV64ZBC-ZBKC-NEXT: ret + %tmp = call i64 @llvm.clmul.i64(i64 %a, i64 %b) + ret i64 %tmp +} + declare i64 @llvm.riscv.clmulh.i64(i64 %a, i64 %b) define i64 @clmul64h(i64 %a, i64 %b) nounwind { @@ -37,6 +48,17 @@ define signext i32 @clmul32(i32 signext %a, i32 signext %b) nounwind { %tmp = call i32 @llvm.riscv.clmul.i32(i32 %a, i32 %b) ret i32 %tmp } +declare i32 @llvm.clmul.i32(i32 %a, i32 %b) + +define signext i32 @generic_clmul32(i32 signext %a, i32 signext %b) nounwind { +; RV64ZBC-ZBKC-LABEL: generic_clmul32: +; RV64ZBC-ZBKC: # %bb.0: +; RV64ZBC-ZBKC-NEXT: clmul a0, a0, a1 +; RV64ZBC-ZBKC-NEXT: sext.w a0, a0 +; RV64ZBC-ZBKC-NEXT: ret + %tmp = call i32 @llvm.clmul.i32(i32 %a, i32 %b) + ret i32 %tmp +} declare i32 @llvm.riscv.clmulh.i32(i32 %a, i32 %b) diff --git a/llvm/test/Transforms/InstCombine/clmul.ll b/llvm/test/Transforms/InstCombine/clmul.ll new file mode 100644 index 0000000000000..cdfe0e93d1e31 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/clmul.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +declare i8 @llvm.clmul.i8(i8 %a, i8 %b) +declare i16 @llvm.clmul.i16(i16 %a, i16 %b) +declare i32 @llvm.clmul.i32(i32 %a, i32 %b) +declare i64 @llvm.clmul.i64(i64 %a, i64 %b) +declare <2 x i32> @llvm.clmul.v2i32(<2 x i32> %a, <2 x i32> %b) +declare <4 x i32> @llvm.clmul.v4i32(<4 x i32> %a, <4 x i32> %b) + +; Test constant folding + +define i8 @clmul_i8_const() { +; CHECK-LABEL: @clmul_i8_const( +; CHECK-NEXT: ret i8 14 +; + %result = call i8 @llvm.clmul.i8(i8 5, i8 6) + ret i8 %result +} + +define i16 @clmul_i16_const() { +; CHECK-LABEL: @clmul_i16_const( +; CHECK-NEXT: ret i16 158 +; + ; clmul(15, 13) = clmul(0b1111, 0b1101) = 0b10011110 = 158 + %result = call i16 @llvm.clmul.i16(i16 15, i16 13) + ret i16 %result +} + +define i32 @clmul_i32_const() { +; CHECK-LABEL: @clmul_i32_const( +; CHECK-NEXT: ret i32 14 +; + %result = call i32 @llvm.clmul.i32(i32 5, i32 6) + ret i32 %result +} + +define i64 @clmul_i64_const() { +; CHECK-LABEL: @clmul_i64_const( +; CHECK-NEXT: ret i64 398 +; + ; clmul(31, 17) = clmul(0b11111, 0b10001) = 0b110001110 = 398 + %result = call i64 @llvm.clmul.i64(i64 31, i64 17) + ret i64 %result +} + +; Test zero operands + +define i32 @clmul_zero_left(i32 %a) { +; CHECK-LABEL: @clmul_zero_left( +; CHECK-NEXT: ret i32 0 +; + %result = call i32 @llvm.clmul.i32(i32 0, i32 %a) + ret i32 %result +} + +define i32 @clmul_zero_right(i32 %a) { +; CHECK-LABEL: @clmul_zero_right( +; CHECK-NEXT: ret i32 0 +; + %result = call i32 @llvm.clmul.i32(i32 %a, i32 0) + ret i32 %result +} + +; Test identity (multiply by 1) + +define i32 @clmul_identity_left(i32 %a) { +; CHECK-LABEL: @clmul_identity_left( +; CHECK-NEXT: ret i32 [[A:%.*]] +; + %result = call i32 @llvm.clmul.i32(i32 1, i32 %a) + ret i32 %result +} + +define i32 @clmul_identity_right(i32 %a) { +; CHECK-LABEL: @clmul_identity_right( +; CHECK-NEXT: ret i32 [[A:%.*]] +; + %result = call i32 @llvm.clmul.i32(i32 %a, i32 1) + ret i32 %result +} + +; Test with vector constants + +define <2 x i32> @clmul_v2i32_const() { +; CHECK-LABEL: @clmul_v2i32_const( +; CHECK-NEXT: ret <2 x i32> +; + %result = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> , <2 x i32> ) + ret <2 x i32> %result +} + +define <2 x i32> @clmul_v2i32_zero_left(<2 x i32> %a) { +; CHECK-LABEL: @clmul_v2i32_zero_left( 
+; CHECK-NEXT: ret <2 x i32> zeroinitializer +; + %result = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> zeroinitializer, <2 x i32> %a) + ret <2 x i32> %result +} + +define <2 x i32> @clmul_v2i32_zero_right(<2 x i32> %a) { +; CHECK-LABEL: @clmul_v2i32_zero_right( +; CHECK-NEXT: ret <2 x i32> zeroinitializer +; + %result = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> %a, <2 x i32> zeroinitializer) + ret <2 x i32> %result +} + +define <2 x i32> @clmul_v2i32_identity_left(<2 x i32> %a) { +; CHECK-LABEL: @clmul_v2i32_identity_left( +; CHECK-NEXT: ret <2 x i32> [[A:%.*]] +; + %result = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> , <2 x i32> %a) + ret <2 x i32> %result +} + +define <2 x i32> @clmul_v2i32_identity_right(<2 x i32> %a) { +; CHECK-LABEL: @clmul_v2i32_identity_right( +; CHECK-NEXT: ret <2 x i32> [[A:%.*]] +; + %result = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> %a, <2 x i32> ) + ret <2 x i32> %result +} + +; Test commutativity - this should be optimized to 0 + +define i32 @clmul_commutative_cancel(i32 %a, i32 %b) { +; CHECK-LABEL: @clmul_commutative_cancel( +; CHECK-NEXT: ret i32 0 +; + %result1 = call i32 @llvm.clmul.i32(i32 %a, i32 %b) + %result2 = call i32 @llvm.clmul.i32(i32 %b, i32 %a) + %xor = xor i32 %result1, %result2 + ret i32 %xor +} + +; Test partial evaluation with mixed constants and variables + +define i32 @clmul_partial_const(i32 %a) { +; CHECK-LABEL: @clmul_partial_const( +; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.clmul.i32(i32 [[A:%.*]], i32 5) +; CHECK-NEXT: ret i32 [[RESULT]] +; + %result = call i32 @llvm.clmul.i32(i32 %a, i32 5) + ret i32 %result +} + +; Test with specific known values for manual verification + +define i8 @clmul_i8_manual_check1() { +; CHECK-LABEL: @clmul_i8_manual_check1( +; CHECK-NEXT: ret i8 2 +; + ; clmul(1, 2) = clmul(0b1, 0b10) = 0b10 = 2 + %result = call i8 @llvm.clmul.i8(i8 1, i8 2) + ret i8 %result +} + +define i8 @clmul_i8_manual_check2() { +; CHECK-LABEL: @clmul_i8_manual_check2( +; CHECK-NEXT: ret i8 6 +; + ; clmul(2, 3) = clmul(0b10, 0b11) = 0b110 = 6 + %result = call i8 @llvm.clmul.i8(i8 2, i8 3) + ret i8 %result +} + +define i8 @clmul_i8_manual_check3() { +; CHECK-LABEL: @clmul_i8_manual_check3( +; CHECK-NEXT: ret i8 -64 +; + ; clmul(12, 12) = clmul(0b1100, 0b1100) = 0b11000000 = 192 = -64 (i8) + %result = call i8 @llvm.clmul.i8(i8 12, i8 12) + ret i8 %result +} \ No newline at end of file
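
For hand-checking the documented semantics, a minimal stand-alone reference model follows (a sketch, not part of the patch; the helper name clmul_ref and the use of plain C++ are illustrative assumptions). It XORs one shifted copy of the second operand per set bit of the first and truncates to the requested width, which reproduces the i4 examples from the LangRef section above:

    // clmul_ref.cpp - reference model of the llvm.clmul semantics above.
    #include <cstdint>
    #include <cstdio>

    // Carryless multiply of A and B, truncated to Bits bits (0 < Bits < 64).
    static uint64_t clmul_ref(uint64_t A, uint64_t B, unsigned Bits) {
      uint64_t R = 0;
      for (unsigned I = 0; I != Bits; ++I)
        if ((A >> I) & 1)
          R ^= B << I; // XOR in a shifted copy of B instead of adding it.
      return R & ((UINT64_C(1) << Bits) - 1);
    }

    int main() {
      // The i4 examples from the LangRef section, with negative operands
      // given as their 4-bit two's complement encodings.
      printf("%u\n", (unsigned)clmul_ref(1, 2, 4));   // 2
      printf("%u\n", (unsigned)clmul_ref(5, 6, 4));   // 14
      printf("%u\n", (unsigned)clmul_ref(12, 2, 4));  // 8, i.e. i4 -8
      printf("%u\n", (unsigned)clmul_ref(12, 11, 4)); // 4
      return 0;
    }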