Skip to content

Commit e95b7cf

Browse files
lukel97 and Priyanshu3820
authored and committed
[RISCV] Enable rematerialization for scalar loads (llvm#166774)
In some workloads we see an argument passed on the stack where it is loaded, only for it to be immediately spilled to a different slot on the stack and then reloaded from that spill slot later on. We can avoid the unnecessary spill by marking loads as rematerializable and just directly loading from where the argument was originally passed on the stack. TargetTransformInfo::isReMaterializableImpl checks to make sure that any loads are `MI.isDereferenceableInvariantLoad()`, so we should be able to move the load down to the remat site. This gives a 14.8% reduction in spills in 544.nab_r on rva23u64 -O3, and a few other smaller reductions on llvm-test-suite. I didn't find any benchmarks where the number of spills/reloads increased. Related: llvm#165761
1 parent 8f0500b commit e95b7cf

File tree

6 files changed

+174
-9
lines changed

6 files changed

+174
-9
lines changed

llvm/lib/Target/RISCV/RISCVInstrInfo.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -768,7 +768,7 @@ def BGE : BranchCC_rri<0b101, "bge">;
768768
def BLTU : BranchCC_rri<0b110, "bltu">;
769769
def BGEU : BranchCC_rri<0b111, "bgeu">;
770770

771-
let IsSignExtendingOpW = 1, canFoldAsLoad = 1 in {
771+
let IsSignExtendingOpW = 1, canFoldAsLoad = 1, isReMaterializable = 1 in {
772772
def LB : Load_ri<0b000, "lb">, Sched<[WriteLDB, ReadMemBase]>;
773773
def LH : Load_ri<0b001, "lh">, Sched<[WriteLDH, ReadMemBase]>;
774774
def LW : Load_ri<0b010, "lw">, Sched<[WriteLDW, ReadMemBase]>;
@@ -889,7 +889,7 @@ def CSRRCI : CSR_ii<0b111, "csrrci">;
889889
/// RV64I instructions
890890

891891
let Predicates = [IsRV64] in {
892-
let canFoldAsLoad = 1 in {
892+
let canFoldAsLoad = 1, isReMaterializable = 1 in {
893893
def LWU : Load_ri<0b110, "lwu">, Sched<[WriteLDW, ReadMemBase]>;
894894
def LD : Load_ri<0b011, "ld">, Sched<[WriteLDD, ReadMemBase]>;
895895
}

llvm/lib/Target/RISCV/RISCVInstrInfoD.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ defvar DExtsRV64 = [DExt, ZdinxExt];
7171
//===----------------------------------------------------------------------===//
7272

7373
let Predicates = [HasStdExtD] in {
74-
let canFoldAsLoad = 1 in
74+
let canFoldAsLoad = 1, isReMaterializable = 1 in
7575
def FLD : FPLoad_r<0b011, "fld", FPR64, WriteFLD64>;
7676

7777
// Operands for stores are in the order srcreg, base, offset rather than

llvm/lib/Target/RISCV/RISCVInstrInfoF.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ class PseudoFROUND<DAGOperand Ty, ValueType vt, ValueType intvt = XLenVT>
330330
//===----------------------------------------------------------------------===//
331331

332332
let Predicates = [HasStdExtF] in {
333-
let canFoldAsLoad = 1 in
333+
let canFoldAsLoad = 1, isReMaterializable = 1 in
334334
def FLW : FPLoad_r<0b010, "flw", FPR32, WriteFLD32>;
335335

336336
// Operands for stores are in the order srcreg, base, offset rather than

llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ defvar ZfhminDExts = [ZfhminDExt, ZhinxminZdinxExt, ZhinxminZdinx32Ext];
9090
//===----------------------------------------------------------------------===//
9191

9292
let Predicates = [HasHalfFPLoadStoreMove] in {
93-
let canFoldAsLoad = 1 in
93+
let canFoldAsLoad = 1, isReMaterializable = 1 in
9494
def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>;
9595

9696
// Operands for stores are in the order srcreg, base, offset rather than

llvm/test/CodeGen/RISCV/remat.ll

Lines changed: 168 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -O1 -mtriple=riscv64 -verify-machineinstrs < %s | FileCheck %s
2+
; RUN: llc -O1 -mtriple=riscv64 -mattr=+d,+zfh,+zfbfmin -verify-machineinstrs < %s | FileCheck %s
33

44
@a = common global i32 0, align 4
55
@l = common global i32 0, align 4
@@ -200,3 +200,170 @@ for.end: ; preds = %for.inc, %entry
200200
}
201201

202202
declare i32 @foo(i32, i32, i32, i32, i32, i32)
203+
204+
define void @remat_load(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, double %8, double %9, double %10, double %11, double %12, double %13, double %14, double %15, i8 %stackarg0, i16 %stackarg1, i32 %stackarg2, i64 %stackarg3, half %stackarg4, bfloat %stackarg5, float %stackarg6, double %stackarg7, ptr %p) nounwind {
205+
; CHECK-LABEL: remat_load:
206+
; CHECK: # %bb.0: # %entry
207+
; CHECK-NEXT: addi sp, sp, -208
208+
; CHECK-NEXT: sd ra, 200(sp) # 8-byte Folded Spill
209+
; CHECK-NEXT: sd s0, 192(sp) # 8-byte Folded Spill
210+
; CHECK-NEXT: sd s1, 184(sp) # 8-byte Folded Spill
211+
; CHECK-NEXT: sd s2, 176(sp) # 8-byte Folded Spill
212+
; CHECK-NEXT: sd s3, 168(sp) # 8-byte Folded Spill
213+
; CHECK-NEXT: sd s4, 160(sp) # 8-byte Folded Spill
214+
; CHECK-NEXT: sd s5, 152(sp) # 8-byte Folded Spill
215+
; CHECK-NEXT: sd s6, 144(sp) # 8-byte Folded Spill
216+
; CHECK-NEXT: sd s7, 136(sp) # 8-byte Folded Spill
217+
; CHECK-NEXT: sd s8, 128(sp) # 8-byte Folded Spill
218+
; CHECK-NEXT: sd s9, 120(sp) # 8-byte Folded Spill
219+
; CHECK-NEXT: sd s10, 112(sp) # 8-byte Folded Spill
220+
; CHECK-NEXT: sd s11, 104(sp) # 8-byte Folded Spill
221+
; CHECK-NEXT: fsd fs0, 96(sp) # 8-byte Folded Spill
222+
; CHECK-NEXT: fsd fs1, 88(sp) # 8-byte Folded Spill
223+
; CHECK-NEXT: fsd fs2, 80(sp) # 8-byte Folded Spill
224+
; CHECK-NEXT: fsd fs3, 72(sp) # 8-byte Folded Spill
225+
; CHECK-NEXT: fsd fs4, 64(sp) # 8-byte Folded Spill
226+
; CHECK-NEXT: fsd fs5, 56(sp) # 8-byte Folded Spill
227+
; CHECK-NEXT: fsd fs6, 48(sp) # 8-byte Folded Spill
228+
; CHECK-NEXT: fsd fs7, 40(sp) # 8-byte Folded Spill
229+
; CHECK-NEXT: fsd fs8, 32(sp) # 8-byte Folded Spill
230+
; CHECK-NEXT: fsd fs9, 24(sp) # 8-byte Folded Spill
231+
; CHECK-NEXT: fsd fs10, 16(sp) # 8-byte Folded Spill
232+
; CHECK-NEXT: fsd fs11, 8(sp) # 8-byte Folded Spill
233+
; CHECK-NEXT: fld fa5, 264(sp)
234+
; CHECK-NEXT: flw fa4, 256(sp)
235+
; CHECK-NEXT: flh fa3, 248(sp)
236+
; CHECK-NEXT: flh fa2, 240(sp)
237+
; CHECK-NEXT: ld a0, 272(sp)
238+
; CHECK-NEXT: lbu a4, 208(sp)
239+
; CHECK-NEXT: lh a3, 216(sp)
240+
; CHECK-NEXT: lw a2, 224(sp)
241+
; CHECK-NEXT: ld a1, 232(sp)
242+
; CHECK-NEXT: sb a4, 0(a0)
243+
; CHECK-NEXT: sh a3, 0(a0)
244+
; CHECK-NEXT: sw a2, 0(a0)
245+
; CHECK-NEXT: sd a1, 0(a0)
246+
; CHECK-NEXT: fsh fa2, 0(a0)
247+
; CHECK-NEXT: fsh fa3, 0(a0)
248+
; CHECK-NEXT: fsw fa4, 0(a0)
249+
; CHECK-NEXT: fsd fa5, 0(a0)
250+
; CHECK-NEXT: #APP
251+
; CHECK-NEXT: #NO_APP
252+
; CHECK-NEXT: ld a0, 272(sp)
253+
; CHECK-NEXT: lbu a1, 208(sp)
254+
; CHECK-NEXT: sb a1, 0(a0)
255+
; CHECK-NEXT: lh a1, 216(sp)
256+
; CHECK-NEXT: sh a1, 0(a0)
257+
; CHECK-NEXT: lw a1, 224(sp)
258+
; CHECK-NEXT: sw a1, 0(a0)
259+
; CHECK-NEXT: ld a1, 232(sp)
260+
; CHECK-NEXT: sd a1, 0(a0)
261+
; CHECK-NEXT: flh fa5, 240(sp)
262+
; CHECK-NEXT: fsh fa5, 0(a0)
263+
; CHECK-NEXT: flh fa5, 248(sp)
264+
; CHECK-NEXT: fsh fa5, 0(a0)
265+
; CHECK-NEXT: flw fa5, 256(sp)
266+
; CHECK-NEXT: fsw fa5, 0(a0)
267+
; CHECK-NEXT: fld fa5, 264(sp)
268+
; CHECK-NEXT: fsd fa5, 0(a0)
269+
; CHECK-NEXT: ld ra, 200(sp) # 8-byte Folded Reload
270+
; CHECK-NEXT: ld s0, 192(sp) # 8-byte Folded Reload
271+
; CHECK-NEXT: ld s1, 184(sp) # 8-byte Folded Reload
272+
; CHECK-NEXT: ld s2, 176(sp) # 8-byte Folded Reload
273+
; CHECK-NEXT: ld s3, 168(sp) # 8-byte Folded Reload
274+
; CHECK-NEXT: ld s4, 160(sp) # 8-byte Folded Reload
275+
; CHECK-NEXT: ld s5, 152(sp) # 8-byte Folded Reload
276+
; CHECK-NEXT: ld s6, 144(sp) # 8-byte Folded Reload
277+
; CHECK-NEXT: ld s7, 136(sp) # 8-byte Folded Reload
278+
; CHECK-NEXT: ld s8, 128(sp) # 8-byte Folded Reload
279+
; CHECK-NEXT: ld s9, 120(sp) # 8-byte Folded Reload
280+
; CHECK-NEXT: ld s10, 112(sp) # 8-byte Folded Reload
281+
; CHECK-NEXT: ld s11, 104(sp) # 8-byte Folded Reload
282+
; CHECK-NEXT: fld fs0, 96(sp) # 8-byte Folded Reload
283+
; CHECK-NEXT: fld fs1, 88(sp) # 8-byte Folded Reload
284+
; CHECK-NEXT: fld fs2, 80(sp) # 8-byte Folded Reload
285+
; CHECK-NEXT: fld fs3, 72(sp) # 8-byte Folded Reload
286+
; CHECK-NEXT: fld fs4, 64(sp) # 8-byte Folded Reload
287+
; CHECK-NEXT: fld fs5, 56(sp) # 8-byte Folded Reload
288+
; CHECK-NEXT: fld fs6, 48(sp) # 8-byte Folded Reload
289+
; CHECK-NEXT: fld fs7, 40(sp) # 8-byte Folded Reload
290+
; CHECK-NEXT: fld fs8, 32(sp) # 8-byte Folded Reload
291+
; CHECK-NEXT: fld fs9, 24(sp) # 8-byte Folded Reload
292+
; CHECK-NEXT: fld fs10, 16(sp) # 8-byte Folded Reload
293+
; CHECK-NEXT: fld fs11, 8(sp) # 8-byte Folded Reload
294+
; CHECK-NEXT: addi sp, sp, 208
295+
; CHECK-NEXT: ret
296+
entry:
297+
; Add a use of the stack arguments here so that we will have to load them from
298+
; the stack before the inline asm. Otherwise we would be exercising the
299+
; machine scheduler, not rematerialization.
300+
store volatile i8 %stackarg0, ptr %p
301+
store volatile i16 %stackarg1, ptr %p
302+
store volatile i32 %stackarg2, ptr %p
303+
store volatile i64 %stackarg3, ptr %p
304+
store volatile half %stackarg4, ptr %p
305+
store volatile bfloat %stackarg5, ptr %p
306+
store volatile float %stackarg6, ptr %p
307+
store volatile double %stackarg7, ptr %p
308+
tail call void asm sideeffect "", "~{x1},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31},~{f0},~{f1},~{f2},~{f3},~{f4},~{f5},~{f6},~{f7},~{f8},~{f9},~{f10},~{f11},~{f12},~{f13},~{f14},~{f15},~{f16},~{f17},~{f18},~{f19},~{f20},~{f21},~{f22},~{f23},~{f24},~{f25},~{f26},~{f27},~{f28},~{f29},~{f30},~{f31}"()
309+
; Now use them after spilling everything to force rematerialization
310+
store volatile i8 %stackarg0, ptr %p
311+
store volatile i16 %stackarg1, ptr %p
312+
store volatile i32 %stackarg2, ptr %p
313+
store volatile i64 %stackarg3, ptr %p
314+
store volatile half %stackarg4, ptr %p
315+
store volatile bfloat %stackarg5, ptr %p
316+
store volatile float %stackarg6, ptr %p
317+
store volatile double %stackarg7, ptr %p
318+
ret void
319+
}
320+
321+
; We could remat the load of the constant global if we extended the live
322+
; interval of the high bits of the address.
323+
324+
@const = external constant i32
325+
define i32 @constglobal_load() nounwind {
326+
; CHECK-LABEL: constglobal_load:
327+
; CHECK: # %bb.0: # %entry
328+
; CHECK-NEXT: addi sp, sp, -112
329+
; CHECK-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
330+
; CHECK-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
331+
; CHECK-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
332+
; CHECK-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
333+
; CHECK-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
334+
; CHECK-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
335+
; CHECK-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
336+
; CHECK-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
337+
; CHECK-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
338+
; CHECK-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
339+
; CHECK-NEXT: sd s9, 24(sp) # 8-byte Folded Spill
340+
; CHECK-NEXT: sd s10, 16(sp) # 8-byte Folded Spill
341+
; CHECK-NEXT: sd s11, 8(sp) # 8-byte Folded Spill
342+
; CHECK-NEXT: lui a0, %hi(const)
343+
; CHECK-NEXT: lw a0, %lo(const)(a0)
344+
; CHECK-NEXT: sd a0, 0(sp) # 8-byte Folded Spill
345+
; CHECK-NEXT: #APP
346+
; CHECK-NEXT: #NO_APP
347+
; CHECK-NEXT: ld a0, 0(sp) # 8-byte Folded Reload
348+
; CHECK-NEXT: addiw a0, a0, 1
349+
; CHECK-NEXT: ld ra, 104(sp) # 8-byte Folded Reload
350+
; CHECK-NEXT: ld s0, 96(sp) # 8-byte Folded Reload
351+
; CHECK-NEXT: ld s1, 88(sp) # 8-byte Folded Reload
352+
; CHECK-NEXT: ld s2, 80(sp) # 8-byte Folded Reload
353+
; CHECK-NEXT: ld s3, 72(sp) # 8-byte Folded Reload
354+
; CHECK-NEXT: ld s4, 64(sp) # 8-byte Folded Reload
355+
; CHECK-NEXT: ld s5, 56(sp) # 8-byte Folded Reload
356+
; CHECK-NEXT: ld s6, 48(sp) # 8-byte Folded Reload
357+
; CHECK-NEXT: ld s7, 40(sp) # 8-byte Folded Reload
358+
; CHECK-NEXT: ld s8, 32(sp) # 8-byte Folded Reload
359+
; CHECK-NEXT: ld s9, 24(sp) # 8-byte Folded Reload
360+
; CHECK-NEXT: ld s10, 16(sp) # 8-byte Folded Reload
361+
; CHECK-NEXT: ld s11, 8(sp) # 8-byte Folded Reload
362+
; CHECK-NEXT: addi sp, sp, 112
363+
; CHECK-NEXT: ret
364+
entry:
365+
%global = load i32, ptr @const
366+
tail call void asm sideeffect "", "~{x1},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"()
367+
%a = add i32 %global, 1
368+
ret i32 %a
369+
}

llvm/test/CodeGen/RISCV/rvv/pr95865.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,6 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
4040
; CHECK-NEXT: li t0, 12
4141
; CHECK-NEXT: li s0, 4
4242
; CHECK-NEXT: li t1, 20
43-
; CHECK-NEXT: ld a1, 112(sp)
44-
; CHECK-NEXT: sd a1, 0(sp) # 8-byte Folded Spill
4543
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
4644
; CHECK-NEXT: vmv.v.i v8, 0
4745
; CHECK-NEXT: andi t3, a4, 1
@@ -142,7 +140,7 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
142140
; CHECK-NEXT: j .LBB0_11
143141
; CHECK-NEXT: .LBB0_12: # %for.body7.us.19
144142
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
145-
; CHECK-NEXT: ld a0, 0(sp) # 8-byte Folded Reload
143+
; CHECK-NEXT: ld a0, 112(sp)
146144
; CHECK-NEXT: vmv.s.x v16, a0
147145
; CHECK-NEXT: vmv.v.i v8, 0
148146
; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma

0 commit comments

Comments (0)