Skip to content

Commit 3ee2152

Browse files
committed
Implement efficient memset for small sizes using DUP instruction
For small non-zero memset operations (4 and 8 bytes), generate a vector splat pattern instead of the MUL-by-0x010101 pattern. This enables efficient lowering to the NEON DUP instruction on AArch64:

- memset_4: dup v0.8b, w1; str s0, [x0]
- memset_8: dup v0.8b, w1; str d0, [x0]
- memset_16: dup v0.16b, w1; str q0, [x0]

This matches GCC's output and is more efficient than the previous MUL pattern.

Fixes #165949

Signed-off-by: Osama Abdelkader <[email protected]>
1 parent 269f264 commit 3ee2152

File tree

2 files changed

+66
-31
lines changed

2 files changed

+66
-31
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8543,6 +8543,17 @@ static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
85438543
if (!IntVT.isInteger())
85448544
IntVT = EVT::getIntegerVT(*DAG.getContext(), IntVT.getSizeInBits());
85458545

8546+
// For repeated-byte patterns, generate a vector splat instead of MUL to enable
8547+
// efficient lowering to DUP on targets like AArch64.
8548+
if (NumBits > 8 && VT.isInteger() && !VT.isVector() &&
8549+
(NumBits == 32 || NumBits == 64)) {
8550+
// Generate a vector of bytes: v4i8 for i32, v8i8 for i64
8551+
EVT ByteVecTy = EVT::getVectorVT(*DAG.getContext(), MVT::i8, NumBits / 8);
8552+
SDValue VecSplat = DAG.getSplatBuildVector(ByteVecTy, dl, Value);
8553+
// Bitcast back to the target integer type
8554+
return DAG.getNode(ISD::BITCAST, dl, IntVT, VecSplat);
8555+
}
8556+
85468557
Value = DAG.getNode(ISD::ZERO_EXTEND, dl, IntVT, Value);
85478558
if (NumBits > 8) {
85488559
// Use a multiplication with 0x010101... to extend the input to the

llvm/test/CodeGen/AArch64/memset-inline.ll

Lines changed: 55 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -27,39 +27,57 @@ define void @memset_2(ptr %a, i8 %value) nounwind {
2727
}
2828

2929
define void @memset_4(ptr %a, i8 %value) nounwind {
30-
; ALL-LABEL: memset_4:
31-
; ALL: // %bb.0:
32-
; ALL-NEXT: mov w8, #16843009
33-
; ALL-NEXT: and w9, w1, #0xff
34-
; ALL-NEXT: mul w8, w9, w8
35-
; ALL-NEXT: str w8, [x0]
36-
; ALL-NEXT: ret
30+
; GPR-LABEL: memset_4:
31+
; GPR: // %bb.0:
32+
; GPR-NEXT: mov w8, #16843009
33+
; GPR-NEXT: and w9, w1, #0xff
34+
; GPR-NEXT: mul w8, w9, w8
35+
; GPR-NEXT: str w8, [x0]
36+
; GPR-NEXT: ret
37+
;
38+
; NEON-LABEL: memset_4:
39+
; NEON: // %bb.0:
40+
; NEON-NEXT: dup v0.8b, w1
41+
; NEON-NEXT: str s0, [x0]
42+
; NEON-NEXT: ret
3743
tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 4, i1 0)
3844
ret void
3945
}
4046

4147
define void @memset_8(ptr %a, i8 %value) nounwind {
42-
; ALL-LABEL: memset_8:
43-
; ALL: // %bb.0:
44-
; ALL-NEXT: // kill: def $w1 killed $w1 def $x1
45-
; ALL-NEXT: mov x8, #72340172838076673
46-
; ALL-NEXT: and x9, x1, #0xff
47-
; ALL-NEXT: mul x8, x9, x8
48-
; ALL-NEXT: str x8, [x0]
49-
; ALL-NEXT: ret
48+
; GPR-LABEL: memset_8:
49+
; GPR: // %bb.0:
50+
; GPR-NEXT: // kill: def $w1 killed $w1 def $x1
51+
; GPR-NEXT: mov x8, #72340172838076673
52+
; GPR-NEXT: and x9, x1, #0xff
53+
; GPR-NEXT: mul x8, x9, x8
54+
; GPR-NEXT: str x8, [x0]
55+
; GPR-NEXT: ret
56+
;
57+
; NEON-LABEL: memset_8:
58+
; NEON: // %bb.0:
59+
; NEON-NEXT: dup v0.8b, w1
60+
; NEON-NEXT: str d0, [x0]
61+
; NEON-NEXT: ret
5062
tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 8, i1 0)
5163
ret void
5264
}
5365

5466
define void @memset_16(ptr %a, i8 %value) nounwind {
55-
; ALL-LABEL: memset_16:
56-
; ALL: // %bb.0:
57-
; ALL-NEXT: // kill: def $w1 killed $w1 def $x1
58-
; ALL-NEXT: mov x8, #72340172838076673
59-
; ALL-NEXT: and x9, x1, #0xff
60-
; ALL-NEXT: mul x8, x9, x8
61-
; ALL-NEXT: stp x8, x8, [x0]
62-
; ALL-NEXT: ret
67+
; GPR-LABEL: memset_16:
68+
; GPR: // %bb.0:
69+
; GPR-NEXT: // kill: def $w1 killed $w1 def $x1
70+
; GPR-NEXT: mov x8, #72340172838076673
71+
; GPR-NEXT: and x9, x1, #0xff
72+
; GPR-NEXT: mul x8, x9, x8
73+
; GPR-NEXT: stp x8, x8, [x0]
74+
; GPR-NEXT: ret
75+
;
76+
; NEON-LABEL: memset_16:
77+
; NEON: // %bb.0:
78+
; NEON-NEXT: dup v0.16b, w1
79+
; NEON-NEXT: str q0, [x0]
80+
; NEON-NEXT: ret
6381
tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 16, i1 0)
6482
ret void
6583
}
@@ -110,14 +128,20 @@ define void @memset_64(ptr %a, i8 %value) nounwind {
110128
; /////////////////////////////////////////////////////////////////////////////
111129

112130
define void @aligned_memset_16(ptr align 16 %a, i8 %value) nounwind {
113-
; ALL-LABEL: aligned_memset_16:
114-
; ALL: // %bb.0:
115-
; ALL-NEXT: // kill: def $w1 killed $w1 def $x1
116-
; ALL-NEXT: mov x8, #72340172838076673
117-
; ALL-NEXT: and x9, x1, #0xff
118-
; ALL-NEXT: mul x8, x9, x8
119-
; ALL-NEXT: stp x8, x8, [x0]
120-
; ALL-NEXT: ret
131+
; GPR-LABEL: aligned_memset_16:
132+
; GPR: // %bb.0:
133+
; GPR-NEXT: // kill: def $w1 killed $w1 def $x1
134+
; GPR-NEXT: mov x8, #72340172838076673
135+
; GPR-NEXT: and x9, x1, #0xff
136+
; GPR-NEXT: mul x8, x9, x8
137+
; GPR-NEXT: stp x8, x8, [x0]
138+
; GPR-NEXT: ret
139+
;
140+
; NEON-LABEL: aligned_memset_16:
141+
; NEON: // %bb.0:
142+
; NEON-NEXT: dup v0.16b, w1
143+
; NEON-NEXT: str q0, [x0]
144+
; NEON-NEXT: ret
121145
tail call void @llvm.memset.inline.p0.i64(ptr align 16 %a, i8 %value, i64 16, i1 0)
122146
ret void
123147
}

0 commit comments

Comments
 (0)