1313
1414define void @memset_1 (ptr %a , i128 %value ) nounwind {
1515; RV32-BOTH-LABEL: memset_1:
16- ; RV32-BOTH: # %bb.0: # %storeloop .preheader
16+ ; RV32-BOTH: # %bb.0: # %loadstoreloop .preheader
1717; RV32-BOTH-NEXT: lw a2, 0(a1)
1818; RV32-BOTH-NEXT: lw a3, 4(a1)
1919; RV32-BOTH-NEXT: lw a4, 8(a1)
2020; RV32-BOTH-NEXT: lw a1, 12(a1)
21- ; RV32-BOTH-NEXT: addi a5, a0, 16
22- ; RV32-BOTH-NEXT: .LBB0_1: # %storeloop
21+ ; RV32-BOTH-NEXT: li a5, 0
22+ ; RV32-BOTH-NEXT: li a6, 0
23+ ; RV32-BOTH-NEXT: .LBB0_1: # %loadstoreloop
2324; RV32-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
24- ; RV32-BOTH-NEXT: sw a2, 0(a0)
25- ; RV32-BOTH-NEXT: sw a3, 4(a0)
26- ; RV32-BOTH-NEXT: sw a4, 8(a0)
27- ; RV32-BOTH-NEXT: sw a1, 12(a0)
28- ; RV32-BOTH-NEXT: addi a0, a0, 16
29- ; RV32-BOTH-NEXT: bne a0, a5, .LBB0_1
25+ ; RV32-BOTH-NEXT: slli a7, a5, 4
26+ ; RV32-BOTH-NEXT: add a7, a0, a7
27+ ; RV32-BOTH-NEXT: addi a5, a5, 1
28+ ; RV32-BOTH-NEXT: seqz t0, a5
29+ ; RV32-BOTH-NEXT: add a6, a6, t0
30+ ; RV32-BOTH-NEXT: or t0, a5, a6
31+ ; RV32-BOTH-NEXT: sw a2, 0(a7)
32+ ; RV32-BOTH-NEXT: sw a3, 4(a7)
33+ ; RV32-BOTH-NEXT: sw a4, 8(a7)
34+ ; RV32-BOTH-NEXT: sw a1, 12(a7)
35+ ; RV32-BOTH-NEXT: beqz t0, .LBB0_1
3036; RV32-BOTH-NEXT: # %bb.2: # %split
3137; RV32-BOTH-NEXT: ret
3238;
3339; RV64-BOTH-LABEL: memset_1:
34- ; RV64-BOTH: # %bb.0: # %storeloop .preheader
40+ ; RV64-BOTH: # %bb.0: # %loadstoreloop .preheader
3541; RV64-BOTH-NEXT: addi a3, a0, 16
36- ; RV64-BOTH-NEXT: .LBB0_1: # %storeloop
42+ ; RV64-BOTH-NEXT: .LBB0_1: # %loadstoreloop
3743; RV64-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
3844; RV64-BOTH-NEXT: sd a1, 0(a0)
3945; RV64-BOTH-NEXT: sd a2, 8(a0)
@@ -47,57 +53,69 @@ define void @memset_1(ptr %a, i128 %value) nounwind {
4753
4854define void @memset_1_noalign (ptr %a , i128 %value ) nounwind {
4955; RV32-LABEL: memset_1_noalign:
50- ; RV32: # %bb.0: # %storeloop.preheader
51- ; RV32-NEXT: addi sp, sp, -16
52- ; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
53- ; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
54- ; RV32-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
55- ; RV32-NEXT: lw a2, 0(a1)
56- ; RV32-NEXT: lw a3, 4(a1)
57- ; RV32-NEXT: lw a4, 8(a1)
56+ ; RV32: # %bb.0: # %loadstoreloop.preheader
57+ ; RV32-NEXT: addi sp, sp, -32
58+ ; RV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill
59+ ; RV32-NEXT: sw s1, 24(sp) # 4-byte Folded Spill
60+ ; RV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill
61+ ; RV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill
62+ ; RV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill
63+ ; RV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill
64+ ; RV32-NEXT: li a2, 0
65+ ; RV32-NEXT: li a3, 0
66+ ; RV32-NEXT: lw a4, 4(a1)
67+ ; RV32-NEXT: lw a5, 0(a1)
68+ ; RV32-NEXT: lw a6, 8(a1)
5869; RV32-NEXT: lw a1, 12(a1)
59- ; RV32-NEXT: addi a5, a0, 16
60- ; RV32-NEXT: srli a6, a2, 24
61- ; RV32-NEXT: srli a7, a2, 16
62- ; RV32-NEXT: srli t0, a2, 8
63- ; RV32-NEXT: srli t1, a3, 24
64- ; RV32-NEXT: srli t2, a3, 16
65- ; RV32-NEXT: srli t3, a3, 8
66- ; RV32-NEXT: srli t4, a4, 24
67- ; RV32-NEXT: srli t5, a4, 16
68- ; RV32-NEXT: srli t6, a4, 8
69- ; RV32-NEXT: srli s0, a1, 24
70- ; RV32-NEXT: srli s1, a1, 16
71- ; RV32-NEXT: srli s2, a1, 8
72- ; RV32-NEXT: .LBB1_1: # %storeloop
70+ ; RV32-NEXT: srli a7, a4, 24
71+ ; RV32-NEXT: srli t0, a4, 16
72+ ; RV32-NEXT: srli t1, a4, 8
73+ ; RV32-NEXT: srli t2, a5, 24
74+ ; RV32-NEXT: srli t3, a5, 16
75+ ; RV32-NEXT: srli t4, a5, 8
76+ ; RV32-NEXT: srli t5, a6, 24
77+ ; RV32-NEXT: srli t6, a6, 16
78+ ; RV32-NEXT: srli s0, a6, 8
79+ ; RV32-NEXT: srli s1, a1, 24
80+ ; RV32-NEXT: srli s2, a1, 16
81+ ; RV32-NEXT: srli s3, a1, 8
82+ ; RV32-NEXT: .LBB1_1: # %loadstoreloop
7383; RV32-NEXT: # =>This Inner Loop Header: Depth=1
74- ; RV32-NEXT: sb a2, 0(a0)
75- ; RV32-NEXT: sb t0, 1(a0)
76- ; RV32-NEXT: sb a7, 2(a0)
77- ; RV32-NEXT: sb a6, 3(a0)
78- ; RV32-NEXT: sb a3, 4(a0)
79- ; RV32-NEXT: sb t3, 5(a0)
80- ; RV32-NEXT: sb t2, 6(a0)
81- ; RV32-NEXT: sb t1, 7(a0)
82- ; RV32-NEXT: sb a4, 8(a0)
83- ; RV32-NEXT: sb t6, 9(a0)
84- ; RV32-NEXT: sb t5, 10(a0)
85- ; RV32-NEXT: sb t4, 11(a0)
86- ; RV32-NEXT: sb a1, 12(a0)
87- ; RV32-NEXT: sb s2, 13(a0)
88- ; RV32-NEXT: sb s1, 14(a0)
89- ; RV32-NEXT: sb s0, 15(a0)
90- ; RV32-NEXT: addi a0, a0, 16
91- ; RV32-NEXT: bne a0, a5, .LBB1_1
84+ ; RV32-NEXT: slli s4, a2, 4
85+ ; RV32-NEXT: add s4, a0, s4
86+ ; RV32-NEXT: sb a4, 4(s4)
87+ ; RV32-NEXT: sb t1, 5(s4)
88+ ; RV32-NEXT: sb t0, 6(s4)
89+ ; RV32-NEXT: sb a7, 7(s4)
90+ ; RV32-NEXT: sb a5, 0(s4)
91+ ; RV32-NEXT: sb t4, 1(s4)
92+ ; RV32-NEXT: sb t3, 2(s4)
93+ ; RV32-NEXT: sb t2, 3(s4)
94+ ; RV32-NEXT: sb a6, 8(s4)
95+ ; RV32-NEXT: sb s0, 9(s4)
96+ ; RV32-NEXT: sb t6, 10(s4)
97+ ; RV32-NEXT: sb t5, 11(s4)
98+ ; RV32-NEXT: addi a2, a2, 1
99+ ; RV32-NEXT: seqz s5, a2
100+ ; RV32-NEXT: add a3, a3, s5
101+ ; RV32-NEXT: or s5, a2, a3
102+ ; RV32-NEXT: sb a1, 12(s4)
103+ ; RV32-NEXT: sb s3, 13(s4)
104+ ; RV32-NEXT: sb s2, 14(s4)
105+ ; RV32-NEXT: sb s1, 15(s4)
106+ ; RV32-NEXT: beqz s5, .LBB1_1
92107; RV32-NEXT: # %bb.2: # %split
93- ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
94- ; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
95- ; RV32-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
96- ; RV32-NEXT: addi sp, sp, 16
108+ ; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
109+ ; RV32-NEXT: lw s1, 24(sp) # 4-byte Folded Reload
110+ ; RV32-NEXT: lw s2, 20(sp) # 4-byte Folded Reload
111+ ; RV32-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
112+ ; RV32-NEXT: lw s4, 12(sp) # 4-byte Folded Reload
113+ ; RV32-NEXT: lw s5, 8(sp) # 4-byte Folded Reload
114+ ; RV32-NEXT: addi sp, sp, 32
97115; RV32-NEXT: ret
98116;
99117; RV64-LABEL: memset_1_noalign:
100- ; RV64: # %bb.0: # %storeloop .preheader
118+ ; RV64: # %bb.0: # %loadstoreloop .preheader
101119; RV64-NEXT: addi sp, sp, -32
102120; RV64-NEXT: sd s0, 24(sp) # 8-byte Folded Spill
103121; RV64-NEXT: sd s1, 16(sp) # 8-byte Folded Spill
@@ -117,7 +135,7 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
117135; RV64-NEXT: srli s0, a2, 24
118136; RV64-NEXT: srli s1, a2, 16
119137; RV64-NEXT: srli s2, a2, 8
120- ; RV64-NEXT: .LBB1_1: # %storeloop
138+ ; RV64-NEXT: .LBB1_1: # %loadstoreloop
121139; RV64-NEXT: # =>This Inner Loop Header: Depth=1
122140; RV64-NEXT: sb a7, 4(a0)
123141; RV64-NEXT: sb a6, 5(a0)
@@ -145,27 +163,33 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
145163; RV64-NEXT: ret
146164;
147165; RV32-FAST-LABEL: memset_1_noalign:
148- ; RV32-FAST: # %bb.0: # %storeloop .preheader
166+ ; RV32-FAST: # %bb.0: # %loadstoreloop .preheader
149167; RV32-FAST-NEXT: lw a2, 0(a1)
150168; RV32-FAST-NEXT: lw a3, 4(a1)
151169; RV32-FAST-NEXT: lw a4, 8(a1)
152170; RV32-FAST-NEXT: lw a1, 12(a1)
153- ; RV32-FAST-NEXT: addi a5, a0, 16
154- ; RV32-FAST-NEXT: .LBB1_1: # %storeloop
171+ ; RV32-FAST-NEXT: li a5, 0
172+ ; RV32-FAST-NEXT: li a6, 0
173+ ; RV32-FAST-NEXT: .LBB1_1: # %loadstoreloop
155174; RV32-FAST-NEXT: # =>This Inner Loop Header: Depth=1
156- ; RV32-FAST-NEXT: sw a2, 0(a0)
157- ; RV32-FAST-NEXT: sw a3, 4(a0)
158- ; RV32-FAST-NEXT: sw a4, 8(a0)
159- ; RV32-FAST-NEXT: sw a1, 12(a0)
160- ; RV32-FAST-NEXT: addi a0, a0, 16
161- ; RV32-FAST-NEXT: bne a0, a5, .LBB1_1
175+ ; RV32-FAST-NEXT: slli a7, a5, 4
176+ ; RV32-FAST-NEXT: add a7, a0, a7
177+ ; RV32-FAST-NEXT: addi a5, a5, 1
178+ ; RV32-FAST-NEXT: seqz t0, a5
179+ ; RV32-FAST-NEXT: add a6, a6, t0
180+ ; RV32-FAST-NEXT: or t0, a5, a6
181+ ; RV32-FAST-NEXT: sw a2, 0(a7)
182+ ; RV32-FAST-NEXT: sw a3, 4(a7)
183+ ; RV32-FAST-NEXT: sw a4, 8(a7)
184+ ; RV32-FAST-NEXT: sw a1, 12(a7)
185+ ; RV32-FAST-NEXT: beqz t0, .LBB1_1
162186; RV32-FAST-NEXT: # %bb.2: # %split
163187; RV32-FAST-NEXT: ret
164188;
165189; RV64-FAST-LABEL: memset_1_noalign:
166- ; RV64-FAST: # %bb.0: # %storeloop .preheader
190+ ; RV64-FAST: # %bb.0: # %loadstoreloop .preheader
167191; RV64-FAST-NEXT: addi a3, a0, 16
168- ; RV64-FAST-NEXT: .LBB1_1: # %storeloop
192+ ; RV64-FAST-NEXT: .LBB1_1: # %loadstoreloop
169193; RV64-FAST-NEXT: # =>This Inner Loop Header: Depth=1
170194; RV64-FAST-NEXT: sd a1, 0(a0)
171195; RV64-FAST-NEXT: sd a2, 8(a0)
@@ -179,27 +203,35 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
179203
180204define void @memset_4 (ptr %a , i128 %value ) nounwind {
181205; RV32-BOTH-LABEL: memset_4:
182- ; RV32-BOTH: # %bb.0: # %storeloop .preheader
206+ ; RV32-BOTH: # %bb.0: # %loadstoreloop .preheader
183207; RV32-BOTH-NEXT: lw a2, 0(a1)
184208; RV32-BOTH-NEXT: lw a3, 4(a1)
185209; RV32-BOTH-NEXT: lw a4, 8(a1)
186210; RV32-BOTH-NEXT: lw a1, 12(a1)
187- ; RV32-BOTH-NEXT: addi a5, a0, 64
188- ; RV32-BOTH-NEXT: .LBB2_1: # %storeloop
211+ ; RV32-BOTH-NEXT: li a5, 0
212+ ; RV32-BOTH-NEXT: li a6, 0
213+ ; RV32-BOTH-NEXT: .LBB2_1: # %loadstoreloop
189214; RV32-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
190- ; RV32-BOTH-NEXT: sw a2, 0(a0)
191- ; RV32-BOTH-NEXT: sw a3, 4(a0)
192- ; RV32-BOTH-NEXT: sw a4, 8(a0)
193- ; RV32-BOTH-NEXT: sw a1, 12(a0)
194- ; RV32-BOTH-NEXT: addi a0, a0, 16
195- ; RV32-BOTH-NEXT: bne a0, a5, .LBB2_1
215+ ; RV32-BOTH-NEXT: slli a7, a5, 4
216+ ; RV32-BOTH-NEXT: add a7, a0, a7
217+ ; RV32-BOTH-NEXT: addi a5, a5, 1
218+ ; RV32-BOTH-NEXT: seqz t0, a5
219+ ; RV32-BOTH-NEXT: add a6, a6, t0
220+ ; RV32-BOTH-NEXT: seqz t0, a6
221+ ; RV32-BOTH-NEXT: sltiu t1, a5, 4
222+ ; RV32-BOTH-NEXT: and t0, t0, t1
223+ ; RV32-BOTH-NEXT: sw a2, 0(a7)
224+ ; RV32-BOTH-NEXT: sw a3, 4(a7)
225+ ; RV32-BOTH-NEXT: sw a4, 8(a7)
226+ ; RV32-BOTH-NEXT: sw a1, 12(a7)
227+ ; RV32-BOTH-NEXT: bnez t0, .LBB2_1
196228; RV32-BOTH-NEXT: # %bb.2: # %split
197229; RV32-BOTH-NEXT: ret
198230;
199231; RV64-BOTH-LABEL: memset_4:
200- ; RV64-BOTH: # %bb.0: # %storeloop .preheader
232+ ; RV64-BOTH: # %bb.0: # %loadstoreloop .preheader
201233; RV64-BOTH-NEXT: addi a3, a0, 64
202- ; RV64-BOTH-NEXT: .LBB2_1: # %storeloop
234+ ; RV64-BOTH-NEXT: .LBB2_1: # %loadstoreloop
203235; RV64-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
204236; RV64-BOTH-NEXT: sd a1, 0(a0)
205237; RV64-BOTH-NEXT: sd a2, 8(a0)
@@ -214,38 +246,50 @@ define void @memset_4(ptr %a, i128 %value) nounwind {
214246define void @memset_x (ptr %a , i128 %value , i64 %x ) nounwind {
215247; RV32-BOTH-LABEL: memset_x:
216248; RV32-BOTH: # %bb.0:
217- ; RV32-BOTH-NEXT: or a3 , a2, a3
218- ; RV32-BOTH-NEXT: beqz a3 , .LBB3_3
219- ; RV32-BOTH-NEXT: # %bb.1: # %storeloop .preheader
220- ; RV32-BOTH-NEXT: lw a3 , 0(a1)
221- ; RV32-BOTH-NEXT: lw a4 , 4(a1)
222- ; RV32-BOTH-NEXT: lw a5 , 8(a1)
249+ ; RV32-BOTH-NEXT: or a4 , a2, a3
250+ ; RV32-BOTH-NEXT: beqz a4 , .LBB3_5
251+ ; RV32-BOTH-NEXT: # %bb.1: # %loadstoreloop .preheader
252+ ; RV32-BOTH-NEXT: lw a4 , 0(a1)
253+ ; RV32-BOTH-NEXT: lw a5 , 4(a1)
254+ ; RV32-BOTH-NEXT: lw a6 , 8(a1)
223255; RV32-BOTH-NEXT: lw a1, 12(a1)
224- ; RV32-BOTH-NEXT: slli a2, a2, 4
225- ; RV32-BOTH-NEXT: add a2, a0, a2
226- ; RV32-BOTH-NEXT: .LBB3_2: # %storeloop
256+ ; RV32-BOTH-NEXT: li a7, 0
257+ ; RV32-BOTH-NEXT: li t0, 0
258+ ; RV32-BOTH-NEXT: j .LBB3_3
259+ ; RV32-BOTH-NEXT: .LBB3_2: # %loadstoreloop
260+ ; RV32-BOTH-NEXT: # in Loop: Header=BB3_3 Depth=1
261+ ; RV32-BOTH-NEXT: sltu t1, t0, a3
262+ ; RV32-BOTH-NEXT: beqz t1, .LBB3_5
263+ ; RV32-BOTH-NEXT: .LBB3_3: # %loadstoreloop
227264; RV32-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
228- ; RV32-BOTH-NEXT: sw a3, 0(a0)
229- ; RV32-BOTH-NEXT: sw a4, 4(a0)
230- ; RV32-BOTH-NEXT: sw a5, 8(a0)
231- ; RV32-BOTH-NEXT: sw a1, 12(a0)
232- ; RV32-BOTH-NEXT: addi a0, a0, 16
233- ; RV32-BOTH-NEXT: bne a0, a2, .LBB3_2
234- ; RV32-BOTH-NEXT: .LBB3_3: # %split
265+ ; RV32-BOTH-NEXT: slli t1, a7, 4
266+ ; RV32-BOTH-NEXT: add t1, a0, t1
267+ ; RV32-BOTH-NEXT: addi a7, a7, 1
268+ ; RV32-BOTH-NEXT: seqz t2, a7
269+ ; RV32-BOTH-NEXT: add t0, t0, t2
270+ ; RV32-BOTH-NEXT: sw a4, 0(t1)
271+ ; RV32-BOTH-NEXT: sw a5, 4(t1)
272+ ; RV32-BOTH-NEXT: sw a6, 8(t1)
273+ ; RV32-BOTH-NEXT: sw a1, 12(t1)
274+ ; RV32-BOTH-NEXT: bne t0, a3, .LBB3_2
275+ ; RV32-BOTH-NEXT: # %bb.4: # in Loop: Header=BB3_3 Depth=1
276+ ; RV32-BOTH-NEXT: sltu t1, a7, a2
277+ ; RV32-BOTH-NEXT: bnez t1, .LBB3_3
278+ ; RV32-BOTH-NEXT: .LBB3_5: # %split
235279; RV32-BOTH-NEXT: ret
236280;
237281; RV64-BOTH-LABEL: memset_x:
238282; RV64-BOTH: # %bb.0:
239283; RV64-BOTH-NEXT: beqz a3, .LBB3_3
240- ; RV64-BOTH-NEXT: # %bb.1: # %storeloop.preheader
241- ; RV64-BOTH-NEXT: slli a3, a3, 4
242- ; RV64-BOTH-NEXT: add a3, a0, a3
243- ; RV64-BOTH-NEXT: .LBB3_2: # %storeloop
284+ ; RV64-BOTH-NEXT: # %bb.1: # %loadstoreloop.preheader
285+ ; RV64-BOTH-NEXT: li a4, 0
286+ ; RV64-BOTH-NEXT: .LBB3_2: # %loadstoreloop
244287; RV64-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
245288; RV64-BOTH-NEXT: sd a1, 0(a0)
246289; RV64-BOTH-NEXT: sd a2, 8(a0)
290+ ; RV64-BOTH-NEXT: addi a4, a4, 1
247291; RV64-BOTH-NEXT: addi a0, a0, 16
248- ; RV64-BOTH-NEXT: bne a0 , a3, .LBB3_2
292+ ; RV64-BOTH-NEXT: bltu a4 , a3, .LBB3_2
249293; RV64-BOTH-NEXT: .LBB3_3: # %split
250294; RV64-BOTH-NEXT: ret
251295 tail call void @llvm.memset.pattern (ptr align 8 %a , i128 %value , i64 %x , i1 0 )
0 commit comments