Skip to content

Conversation

@svs-quic
Copy link
Contributor

This patch adds support for converting memset calls to one or more QC_SETWMI instructions when beneficial. We only handle aligned memset calls for now.

We limit a QC_SETWMI to 16 words or less to improve interruptibility.
So for 1-16 words we use a single QC_SETWMI:

QC_SETWMI reg1, N, 0(reg2)

For 17-32 words we use two QC_SETWMI's with the first as 16 words and the second for the remainder:

QC_SETWMI reg1, 16, 0(reg2)
QC_SETWMI reg1, N-16, 64(reg2)

For 33-48 words, we would like to use (16, 16, n), but that means the last QC_SETWMI needs an offset of 128 which the instruction doesn't support. So in this case we use a length of 15 for the second instruction and we do the rest with the third instruction.

This means the maximum number of words handled is 47 (for now):

QC_SETWMI R2, R0, 16, 0
QC_SETWMI R2, R0, 15, 64
QC_SETWMI R2, R0, N-31, 124

For 48 words or more, call the target independent memset.

@llvmbot
Copy link
Member

llvmbot commented Jul 31, 2025

@llvm/pr-subscribers-backend-risc-v

Author: Sudharsan Veeravalli (svs-quic)

Changes

This patch adds support for converting memset calls to one or more QC_SETWMI instructions when beneficial. We only handle aligned memset calls for now.

We limit a QC_SETWMI to 16 words or less to improve interruptibility.
So for 1-16 words we use a single QC_SETWMI:

QC_SETWMI reg1, N, 0(reg2)

For 17-32 words we use two QC_SETWMI's with the first as 16 words and the second for the remainder:

QC_SETWMI reg1, 16, 0(reg2)
QC_SETWMI reg1, N-16, 64(reg2)

For 33-48 words, we would like to use (16, 16, n), but that means the last QC_SETWMI needs an offset of 128 which the instruction doesn't support. So in this case we use a length of 15 for the second instruction and we do the rest with the third instruction.

This means the maximum number of words handled is 47 (for now):

QC_SETWMI R2, R0, 16, 0
QC_SETWMI R2, R0, 15, 64
QC_SETWMI R2, R0, N-31, 124

For 48 words or more, call the target independent memset.


Patch is 36.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151555.diff

5 Files Affected:

  • (modified) llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp (+9)
  • (modified) llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td (+8)
  • (modified) llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp (+101)
  • (modified) llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h (+6)
  • (added) llvm/test/CodeGen/RISCV/xqcilsm-memset.ll (+929)
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index f223fdbef4359..b778c33083685 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1845,6 +1845,15 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     CurDAG->RemoveDeadNode(Node);
     return;
   }
+  case RISCVISD::QC_SETWMI: {
+    SDValue Chain = Node->getOperand(0);
+    SDVTList VTs = Node->getVTList();
+    SDValue Ops[] = {Node->getOperand(1), Node->getOperand(2),
+                     Node->getOperand(3), Node->getOperand(4), Chain};
+    MachineSDNode *New = CurDAG->getMachineNode(RISCV::QC_SETWMI, DL, VTs, Ops);
+    ReplaceNode(Node, New);
+    return;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IntNo = Node->getConstantOperandVal(0);
     switch (IntNo) {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 52656134b7774..2479ced164927 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -14,6 +14,14 @@
 // Operand and SDNode transformation definitions.
 //===----------------------------------------------------------------------===//
 
+def SDT_StoreMultiple : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<1, 3>,
+                                             SDTCisPtrTy<2>,
+                                             SDTCisVT<3, XLenVT>]>;
+
+def qc_setwmi : RVSDNode<"QC_SETWMI", SDT_StoreMultiple,
+                         [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
 def uimm5nonzero : RISCVOp<XLenVT>,
                    ImmLeaf<XLenVT, [{return (Imm != 0) && isUInt<5>(Imm);}]> {
   let ParserMatchClass = UImmAsmOperand<5, "NonZero">;
diff --git a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp
index 6ecddad72c078..edfa2992711a0 100644
--- a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "RISCVSelectionDAGInfo.h"
+#include "RISCVSubtarget.h"
+#include "llvm/CodeGen/SelectionDAG.h"
 
 #define GET_SDNODE_DESC
 #include "RISCVGenSDNodeInfo.inc"
@@ -62,3 +64,102 @@ void RISCVSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
   }
 #endif
 }
+
+SDValue RISCVSelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo) const {
+  const RISCVSubtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<RISCVSubtarget>();
+  // We currently do this only for Xqcilsm
+  if (!Subtarget.hasVendorXqcilsm())
+    return SDValue();
+
+  // Do this only if we know the size at compile time.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+
+  uint64_t NumberOfBytesToWrite = ConstantSize->getZExtValue();
+
+  // Do this only if it is word aligned and we write multiple of 4 bytes.
+  if (!((Alignment.value() & 3) == 0 && (NumberOfBytesToWrite & 3) == 0))
+    return SDValue();
+
+  SmallVector<SDValue, 8> OutChains;
+  SDValue SizeWords, OffsetSetwmi;
+  SDValue SrcValueReplicated = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
+  int NumberOfWords = NumberOfBytesToWrite / 4;
+
+  // Helper for constructing the QC_SETWMI instruction
+  auto getSetwmiNode = [&](SDValue SizeWords, SDValue OffsetSetwmi) -> SDValue {
+    SDValue Ops[] = {Chain, SrcValueReplicated, Dst, SizeWords, OffsetSetwmi};
+    return DAG.getNode(RISCVISD::QC_SETWMI, dl, MVT::Other, Ops);
+  };
+
+  bool IsZeroVal =
+      isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isZero();
+
+  // If i8 type and the value is not a known constant zero.
+  if ((Src.getValueType() == MVT::i8) && !IsZeroVal)
+    // Replicate byte to word by multiplication with 0x01010101.
+    SrcValueReplicated = DAG.getNode(ISD::MUL, dl, MVT::i32, SrcValueReplicated,
+                                     DAG.getConstant(16843009, dl, MVT::i32));
+
+  // We limit a QC_SETWMI to 16 words or less to improve interruptibility.
+  // So for 1-16 words we use a single QC_SETWMI:
+  //
+  // QC_SETWMI reg1, N, 0(reg2)
+  //
+  // For 17-32 words we use two QC_SETWMI's with the first as 16 words and the
+  // second for the remainder:
+  //
+  // QC_SETWMI reg1, 16, 0(reg2)
+  // QC_SETWMI reg1, N-16, 64(reg2)
+  //
+  // For 33-48 words, we would like to use (16, 16, n), but that means the last
+  // QC_SETWMI needs an offset of 128 which the instruction doesn't support.
+  // So in this case we use a length of 15 for the second instruction and we do
+  // the rest with the third instruction.
+  // This means the maximum inlined number of words is 47 (for now):
+  //
+  // QC_SETWMI R2, R0, 16, 0
+  // QC_SETWMI R2, R0, 15, 64
+  // QC_SETWMI R2, R0, N-31, 124
+  //
+  // For 48 words or more, call the target independent memset
+  if (NumberOfWords <= 16) {
+    // 1 - 16 words
+    SizeWords = DAG.getTargetConstant(NumberOfWords, dl, MVT::i32);
+    SDValue OffsetSetwmi = DAG.getTargetConstant(0, dl, MVT::i32);
+    return getSetwmiNode(SizeWords, OffsetSetwmi);
+  } else if (NumberOfWords <= 47) {
+    if (NumberOfWords <= 32) {
+      // 17 - 32 words
+      SizeWords = DAG.getTargetConstant(NumberOfWords - 16, dl, MVT::i32);
+      OffsetSetwmi = DAG.getTargetConstant(64, dl, MVT::i32);
+      OutChains.push_back(getSetwmiNode(SizeWords, OffsetSetwmi));
+
+      SizeWords = DAG.getTargetConstant(16, dl, MVT::i32);
+      OffsetSetwmi = DAG.getTargetConstant(0, dl, MVT::i32);
+      OutChains.push_back(getSetwmiNode(SizeWords, OffsetSetwmi));
+    } else {
+      // 33 - 47 words
+      SizeWords = DAG.getTargetConstant(NumberOfWords - 31, dl, MVT::i32);
+      OffsetSetwmi = DAG.getTargetConstant(124, dl, MVT::i32);
+      OutChains.push_back(getSetwmiNode(SizeWords, OffsetSetwmi));
+
+      SizeWords = DAG.getTargetConstant(15, dl, MVT::i32);
+      OffsetSetwmi = DAG.getTargetConstant(64, dl, MVT::i32);
+      OutChains.push_back(getSetwmiNode(SizeWords, OffsetSetwmi));
+
+      SizeWords = DAG.getTargetConstant(16, dl, MVT::i32);
+      OffsetSetwmi = DAG.getTargetConstant(0, dl, MVT::i32);
+      OutChains.push_back(getSetwmiNode(SizeWords, OffsetSetwmi));
+    }
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+  }
+
+  // >= 48 words. Call target independent memset.
+  return SDValue();
+}
diff --git a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h
index 641189f8661c1..08c8d11f2b108 100644
--- a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h
@@ -34,6 +34,12 @@ class RISCVSelectionDAGInfo : public SelectionDAGGenTargetInfo {
   void verifyTargetNode(const SelectionDAG &DAG,
                         const SDNode *N) const override;
 
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
+                                  SDValue Chain, SDValue Dst, SDValue Src,
+                                  SDValue Size, Align Alignment,
+                                  bool isVolatile, bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo) const override;
+
   bool hasPassthruOp(unsigned Opcode) const {
     return GenNodeInfo.getDesc(Opcode).TSFlags & RISCVISD::HasPassthruOpMask;
   }
diff --git a/llvm/test/CodeGen/RISCV/xqcilsm-memset.ll b/llvm/test/CodeGen/RISCV/xqcilsm-memset.ll
new file mode 100644
index 0000000000000..b0107cc1a4e03
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/xqcilsm-memset.ll
@@ -0,0 +1,929 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s -check-prefixes=RV32I
+
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs -mattr=+experimental-xqcilsm < %s \
+; RUN: | FileCheck %s -check-prefixes=RV32IXQCISLS
+
+%struct.anon = type { [16 x i32] }
+%struct.anon.0 = type { [47 x i32] }
+%struct.anon.1 = type { [48 x i32] }
+%struct.anon.2 = type { [64 x i8] }
+%struct.struct1_t = type { [16 x i32] }
+
+@struct1 = common dso_local local_unnamed_addr global %struct.anon zeroinitializer, align 4
+@struct4b = common dso_local local_unnamed_addr global %struct.anon.0 zeroinitializer, align 4
+@struct4b1 = common dso_local local_unnamed_addr global %struct.anon.1 zeroinitializer, align 4
+@struct2 = common dso_local local_unnamed_addr global %struct.anon.2 zeroinitializer, align 1
+@arr1 = common dso_local local_unnamed_addr global [100 x i32] zeroinitializer, align 4
+@struct1_ = common dso_local local_unnamed_addr global %struct.struct1_t zeroinitializer, align 4
+
+define void @test1(ptr nocapture %p, i32 %n) nounwind {
+; RV32I-LABEL: test1:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    tail memset
+;
+; RV32IXQCISLS-LABEL: test1:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    mv a2, a1
+; RV32IXQCISLS-NEXT:    li a1, 0
+; RV32IXQCISLS-NEXT:    tail memset
+entry:
+  tail call void @llvm.memset.p0.i32(ptr align 1 %p, i8 0, i32 %n, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1)
+
+define void @test2(ptr nocapture %p) nounwind {
+; RV32I-LABEL: test2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    li a1, 165
+; RV32I-NEXT:    li a2, 128
+; RV32I-NEXT:    tail memset
+;
+; RV32IXQCISLS-LABEL: test2:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    lui a1, 678490
+; RV32IXQCISLS-NEXT:    addi a1, a1, 1445
+; RV32IXQCISLS-NEXT:    qc.setwmi a1, 16, 0(a0)
+; RV32IXQCISLS-NEXT:    qc.setwmi a1, 16, 64(a0)
+; RV32IXQCISLS-NEXT:    ret
+entry:
+  tail call void @llvm.memset.p0.i32(ptr align 4 %p, i8 -91, i32 128, i1 false)
+  ret void
+}
+
+define void @test2a(ptr nocapture %p) nounwind {
+; RV32I-LABEL: test2a:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    li a1, 165
+; RV32I-NEXT:    li a2, 188
+; RV32I-NEXT:    tail memset
+;
+; RV32IXQCISLS-LABEL: test2a:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    lui a1, 678490
+; RV32IXQCISLS-NEXT:    addi a1, a1, 1445
+; RV32IXQCISLS-NEXT:    qc.setwmi a1, 16, 0(a0)
+; RV32IXQCISLS-NEXT:    qc.setwmi a1, 15, 64(a0)
+; RV32IXQCISLS-NEXT:    qc.setwmi a1, 16, 124(a0)
+; RV32IXQCISLS-NEXT:    ret
+entry:
+  tail call void @llvm.memset.p0.i32(ptr align 4 %p, i8 -91, i32 188, i1 false)
+  ret void
+}
+
+define void @test2b(ptr nocapture %p) nounwind {
+; RV32I-LABEL: test2b:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    li a1, 165
+; RV32I-NEXT:    li a2, 192
+; RV32I-NEXT:    tail memset
+;
+; RV32IXQCISLS-LABEL: test2b:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    li a1, 165
+; RV32IXQCISLS-NEXT:    li a2, 192
+; RV32IXQCISLS-NEXT:    tail memset
+entry:
+  tail call void @llvm.memset.p0.i32(ptr align 4 %p, i8 -91, i32 192, i1 false)
+  ret void
+}
+
+define void @test2c(ptr nocapture %p) nounwind {
+; RV32I-LABEL: test2c:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    li a1, 165
+; RV32I-NEXT:    li a2, 128
+; RV32I-NEXT:    tail memset
+;
+; RV32IXQCISLS-LABEL: test2c:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    lui a1, 678490
+; RV32IXQCISLS-NEXT:    addi a1, a1, 1445
+; RV32IXQCISLS-NEXT:    qc.setwmi a1, 16, 0(a0)
+; RV32IXQCISLS-NEXT:    qc.setwmi a1, 16, 64(a0)
+; RV32IXQCISLS-NEXT:    ret
+entry:
+  tail call void @llvm.memset.p0.i32(ptr align 4 %p, i8 -91, i32 128, i1 false)
+  ret void
+}
+
+define void @test2d(ptr nocapture %p) nounwind {
+; RV32I-LABEL: test2d:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    li a1, -91
+; RV32I-NEXT:    lui a2, 1048570
+; RV32I-NEXT:    lui a3, 678490
+; RV32I-NEXT:    addi a2, a2, 1445
+; RV32I-NEXT:    addi a3, a3, 1445
+; RV32I-NEXT:    sw a3, 0(a0)
+; RV32I-NEXT:    sw a3, 4(a0)
+; RV32I-NEXT:    sh a2, 8(a0)
+; RV32I-NEXT:    sb a1, 10(a0)
+; RV32I-NEXT:    ret
+;
+; RV32IXQCISLS-LABEL: test2d:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    li a1, -91
+; RV32IXQCISLS-NEXT:    lui a2, 1048570
+; RV32IXQCISLS-NEXT:    lui a3, 678490
+; RV32IXQCISLS-NEXT:    addi a2, a2, 1445
+; RV32IXQCISLS-NEXT:    addi a3, a3, 1445
+; RV32IXQCISLS-NEXT:    sw a3, 0(a0)
+; RV32IXQCISLS-NEXT:    sw a3, 4(a0)
+; RV32IXQCISLS-NEXT:    sh a2, 8(a0)
+; RV32IXQCISLS-NEXT:    sb a1, 10(a0)
+; RV32IXQCISLS-NEXT:    ret
+entry:
+  tail call void @llvm.memset.p0.i32(ptr align 4 %p, i8 -91, i32 11, i1 false)
+  ret void
+}
+
+
+define ptr @test3(ptr %p) nounwind {
+; RV32I-LABEL: test3:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    li a2, 256
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    tail memset
+;
+; RV32IXQCISLS-LABEL: test3:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    li a2, 256
+; RV32IXQCISLS-NEXT:    li a1, 0
+; RV32IXQCISLS-NEXT:    tail memset
+entry:
+  tail call void @llvm.memset.p0.i32(ptr align 4 %p, i8 0, i32 256, i1 false)
+  ret ptr %p
+}
+
+define ptr @test3a(ptr %p) nounwind {
+; RV32I-LABEL: test3a:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    li a2, 128
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    tail memset
+;
+; RV32IXQCISLS-LABEL: test3a:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    qc.setwmi zero, 16, 0(a0)
+; RV32IXQCISLS-NEXT:    qc.setwmi zero, 16, 64(a0)
+; RV32IXQCISLS-NEXT:    ret
+entry:
+  tail call void @llvm.memset.p0.i32(ptr align 4 %p, i8 0, i32 128, i1 false)
+  ret ptr %p
+}
+
+define void @test4() nounwind {
+; RV32I-LABEL: test4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a0, %hi(struct1)
+; RV32I-NEXT:    addi a0, a0, %lo(struct1)
+; RV32I-NEXT:    li a2, 64
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    tail memset
+;
+; RV32IXQCISLS-LABEL: test4:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    lui a0, %hi(struct1)
+; RV32IXQCISLS-NEXT:    addi a0, a0, %lo(struct1)
+; RV32IXQCISLS-NEXT:    qc.setwmi zero, 16, 0(a0)
+; RV32IXQCISLS-NEXT:    ret
+entry:
+  tail call void @llvm.memset.p0.i32(ptr align 4 @struct1, i8 0, i32 64, i1 false)
+  ret void
+}
+
+define void @test4a(ptr nocapture %s) nounwind {
+; RV32I-LABEL: test4a:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    li a1, 166
+; RV32I-NEXT:    li a2, 64
+; RV32I-NEXT:    tail memset
+;
+; RV32IXQCISLS-LABEL: test4a:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    lui a1, 682602
+; RV32IXQCISLS-NEXT:    addi a1, a1, 1702
+; RV32IXQCISLS-NEXT:    qc.setwmi a1, 16, 0(a0)
+; RV32IXQCISLS-NEXT:    ret
+entry:
+  tail call void @llvm.memset.p0.i32(ptr align 4 %s, i8 -90, i32 64, i1 false)
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
+
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
+
+define void @test4b() nounwind {
+; RV32I-LABEL: test4b:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lui a0, %hi(struct4b)
+; RV32I-NEXT:    addi a0, a0, %lo(struct4b)
+; RV32I-NEXT:    li a2, 188
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    call memset
+; RV32I-NEXT:    lui a0, %hi(struct4b1)
+; RV32I-NEXT:    addi a0, a0, %lo(struct4b1)
+; RV32I-NEXT:    li a2, 192
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    tail memset
+;
+; RV32IXQCISLS-LABEL: test4b:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    lui a1, %hi(struct4b)
+; RV32IXQCISLS-NEXT:    addi a1, a1, %lo(struct4b)
+; RV32IXQCISLS-NEXT:    lui a0, %hi(struct4b1)
+; RV32IXQCISLS-NEXT:    addi a0, a0, %lo(struct4b1)
+; RV32IXQCISLS-NEXT:    li a2, 192
+; RV32IXQCISLS-NEXT:    qc.setwmi zero, 16, 0(a1)
+; RV32IXQCISLS-NEXT:    qc.setwmi zero, 15, 64(a1)
+; RV32IXQCISLS-NEXT:    qc.setwmi zero, 16, 124(a1)
+; RV32IXQCISLS-NEXT:    li a1, 0
+; RV32IXQCISLS-NEXT:    tail memset
+entry:
+  tail call void @llvm.memset.p0.i32(ptr align 4 @struct4b, i8 0, i32 188, i1 false)
+  tail call void @llvm.memset.p0.i32(ptr align 4 @struct4b1, i8 0, i32 192, i1 false)
+  ret void
+}
+
+define void @test5() nounwind {
+; RV32I-LABEL: test5:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a0, %hi(struct2)
+; RV32I-NEXT:    addi a0, a0, %lo(struct2)
+; RV32I-NEXT:    li a2, 64
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    tail memset
+;
+; RV32IXQCISLS-LABEL: test5:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    lui a0, %hi(struct2)
+; RV32IXQCISLS-NEXT:    addi a0, a0, %lo(struct2)
+; RV32IXQCISLS-NEXT:    li a2, 64
+; RV32IXQCISLS-NEXT:    li a1, 0
+; RV32IXQCISLS-NEXT:    tail memset
+entry:
+  tail call void @llvm.memset.p0.i32(ptr align 1 @struct2, i8 0, i32 64, i1 false)
+  ret void
+}
+
+define i32 @test6() nounwind {
+; RV32I-LABEL: test6:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    li a0, 0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV32IXQCISLS-LABEL: test6:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    addi sp, sp, -16
+; RV32IXQCISLS-NEXT:    sw zero, 12(sp)
+; RV32IXQCISLS-NEXT:    li a0, 0
+; RV32IXQCISLS-NEXT:    addi sp, sp, 16
+; RV32IXQCISLS-NEXT:    ret
+entry:
+  %x = alloca i32, align 4
+  call void @llvm.memset.p0.i32(ptr align 4 %x, i8 0, i32 4, i1 false)
+  %0 = load i32, ptr %x, align 4
+  ret i32 %0
+}
+
+define i32 @test6a() nounwind {
+; RV32I-LABEL: test6a:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    lw a0, 12(sp)
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV32IXQCISLS-LABEL: test6a:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    addi sp, sp, -16
+; RV32IXQCISLS-NEXT:    sw zero, 12(sp)
+; RV32IXQCISLS-NEXT:    lw a0, 12(sp)
+; RV32IXQCISLS-NEXT:    addi sp, sp, 16
+; RV32IXQCISLS-NEXT:    ret
+entry:
+  %x = alloca i32, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %x)
+  store i32 0, ptr %x, align 4
+  %x.0.x.0. = load volatile i32, ptr %x, align 4
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %x)
+  ret i32 %x.0.x.0.
+}
+
+define zeroext i8 @test6b_c() nounwind {
+; RV32I-LABEL: test6b_c:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sb zero, 12(sp)
+; RV32I-NEXT:    lbu a0, 12(sp)
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV32IXQCISLS-LABEL: test6b_c:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    addi sp, sp, -16
+; RV32IXQCISLS-NEXT:    sb zero, 12(sp)
+; RV32IXQCISLS-NEXT:    lbu a0, 12(sp)
+; RV32IXQCISLS-NEXT:    addi sp, sp, 16
+; RV32IXQCISLS-NEXT:    ret
+entry:
+  %x = alloca i8, align 4
+  call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %x)
+  call void @llvm.memset.p0.i32(ptr nonnull align 4 %x, i8 0, i32 1, i1 false)
+  %x.0.x.0. = load volatile i8, ptr %x, align 4
+  call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %x)
+  ret i8 %x.0.x.0.
+}
+
+define signext i16 @test6b_s() nounwind {
+; RV32I-LABEL: test6b_s:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sh zero, 12(sp)
+; RV32I-NEXT:    lh a0, 12(sp)
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV32IXQCISLS-LABEL: test6b_s:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    addi sp, sp, -16
+; RV32IXQCISLS-NEXT:    sh zero, 12(sp)
+; RV32IXQCISLS-NEXT:    lh a0, 12(sp)
+; RV32IXQCISLS-NEXT:    addi sp, sp, 16
+; RV32IXQCISLS-NEXT:    ret
+entry:
+  %x = alloca i16, align 4
+  call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %x)
+  store i16 0, ptr %x, align 4
+  %x.0.x.0. = load volatile i16, ptr %x, align 4
+  call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %x)
+  ret i16 %x.0.x.0.
+}
+
+define i32 @test6b_l() nounwind {
+; RV32I-LABEL: test6b_l:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    lw a0, 12(sp)
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV32IXQCISLS-LABEL: test6b_l:
+; RV32IXQCISLS:       # %bb.0: # %entry
+; RV32IXQCISLS-NEXT:    addi sp, sp, -16
+; RV32I...
[truncated]

Copy link
Member

@lenary lenary left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. Just some small nits, and one place where the logic/testing needs an addition.

@github-actions
Copy link

github-actions bot commented Aug 1, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

Copy link
Member

@lenary lenary left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm happy with this, but I think the MachinePointerInfo is not quite being handled correctly. Suggestion inline (which will need reformatting).

Copy link
Member

@lenary lenary left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM. Thanks!

Copy link
Collaborator

@topperc topperc left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@svs-quic svs-quic merged commit c9684e5 into llvm:main Aug 4, 2025
9 checks passed
@svs-quic svs-quic deleted the memset_aligned branch August 4, 2025 07:21
@llvm-ci
Copy link
Collaborator

llvm-ci commented Aug 4, 2025

LLVM Buildbot has detected a new failure on builder mlir-nvidia-gcc7 running on mlir-nvidia while building llvm at step 7 "test-build-check-mlir-build-only-check-mlir".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/116/builds/16511

Here is the relevant piece of the build log for the reference
Step 7 (test-build-check-mlir-build-only-check-mlir) failure: test (failure)
******************** TEST 'MLIR :: Integration/GPU/CUDA/async.mlir' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 1
/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-kernel-outlining  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)'  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-async-region -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary="format=fatbin"  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -async-to-async-runtime -async-runtime-ref-counting  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -convert-async-to-llvm -convert-func-to-llvm -convert-arith-to-llvm -convert-cf-to-llvm -reconcile-unrealized-casts  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-runner    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_cuda_runtime.so    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_async_runtime.so    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_runner_utils.so    --entry-point-result=void -O0  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/FileCheck /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-kernel-outlining
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt '-pass-pipeline=builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)'
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-async-region -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary=format=fatbin
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -async-to-async-runtime -async-runtime-ref-counting
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -convert-async-to-llvm -convert-func-to-llvm -convert-arith-to-llvm -convert-cf-to-llvm -reconcile-unrealized-casts
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-runner --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_cuda_runtime.so --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_async_runtime.so --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_runner_utils.so --entry-point-result=void -O0
# .---command stderr------------
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventSynchronize(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# `-----------------------------
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/FileCheck /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# .---command stderr------------
# | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir:68:12: error: CHECK: expected string not found in input
# |  // CHECK: [84, 84]
# |            ^
# | <stdin>:1:1: note: scanning from here
# | Unranked Memref base@ = 0x56f3efc92250 rank = 1 offset = 0 sizes = [2] strides = [1] data = 
# | ^
# | <stdin>:2:1: note: possible intended match here
# | [42, 42]
# | ^
# | 
# | Input file: <stdin>
# | Check file: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |             1: Unranked Memref base@ = 0x56f3efc92250 rank = 1 offset = 0 sizes = [2] strides = [1] data =  
# | check:68'0     X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
# |             2: [42, 42] 
# | check:68'0     ~~~~~~~~~
# | check:68'1     ?         possible intended match
...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants