Skip to content

Commit 35b2d3c

Browse files
MacDue authored and mahesh-attarde committed
[AArch64][SME] Allow SME peephole optimizations across SME pseudos (llvm#157655)
This allows folding `smstart`/`smstop` pairs in more cases.
1 parent 40eb20c commit 35b2d3c

File tree

3 files changed

+138
-12
lines changed

3 files changed

+138
-12
lines changed

llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -184,6 +184,17 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
184184
isSVERegOp(TRI, MRI, MI.getOperand(1)))
185185
Prev = nullptr;
186186
break;
187+
case AArch64::RestoreZAPseudo:
188+
case AArch64::InOutZAUsePseudo:
189+
case AArch64::CommitZASavePseudo:
190+
case AArch64::SMEStateAllocPseudo:
191+
case AArch64::RequiresZASavePseudo:
192+
// These instructions only depend on the ZA state, not the streaming mode,
193+
// so if the pair of smstart/stop is only changing the streaming mode, we
194+
// can permit these instructions.
195+
if (Prev->getOperand(0).getImm() != AArch64SVCR::SVCRSM)
196+
Prev = nullptr;
197+
break;
187198
case AArch64::ADJCALLSTACKDOWN:
188199
case AArch64::ADJCALLSTACKUP:
189200
case AArch64::ANDXri:

llvm/test/CodeGen/AArch64/sme-agnostic-za.ll

Lines changed: 1 addition & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -169,8 +169,6 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
169169
; CHECK-NEWLOWERING-NEXT: smstop sm
170170
; CHECK-NEWLOWERING-NEXT: mov x0, x8
171171
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
172-
; CHECK-NEWLOWERING-NEXT: smstart sm
173-
; CHECK-NEWLOWERING-NEXT: smstop sm
174172
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
175173
; CHECK-NEWLOWERING-NEXT: smstart sm
176174
; CHECK-NEWLOWERING-NEXT: mov x8, x0
@@ -268,19 +266,11 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
268266
; CHECK-NEWLOWERING-NEXT: .LBB5_2:
269267
; CHECK-NEWLOWERING-NEXT: mov x0, x8
270268
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
269+
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
271270
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_4
272271
; CHECK-NEWLOWERING-NEXT: // %bb.3:
273272
; CHECK-NEWLOWERING-NEXT: smstart sm
274273
; CHECK-NEWLOWERING-NEXT: .LBB5_4:
275-
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_6
276-
; CHECK-NEWLOWERING-NEXT: // %bb.5:
277-
; CHECK-NEWLOWERING-NEXT: smstop sm
278-
; CHECK-NEWLOWERING-NEXT: .LBB5_6:
279-
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
280-
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_8
281-
; CHECK-NEWLOWERING-NEXT: // %bb.7:
282-
; CHECK-NEWLOWERING-NEXT: smstart sm
283-
; CHECK-NEWLOWERING-NEXT: .LBB5_8:
284274
; CHECK-NEWLOWERING-NEXT: mov x8, x0
285275
; CHECK-NEWLOWERING-NEXT: mov x0, x19
286276
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore

llvm/test/CodeGen/AArch64/sme-peephole-opts.ll

Lines changed: 126 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s
2+
; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-new-sme-abi -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s
33

44
declare void @callee()
55
declare void @callee_sm() "aarch64_pstate_sm_enabled"
@@ -563,3 +563,128 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
563563
store <vscale x 4 x float> %res1, ptr %ptr
564564
ret void
565565
}
566+
567+
; normal caller -> streaming callees (with ZA state)
568+
define void @test14(ptr %callee) nounwind "aarch64_inout_za" {
569+
; CHECK-LABEL: test14:
570+
; CHECK: // %bb.0:
571+
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
572+
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
573+
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
574+
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
575+
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
576+
; CHECK-NEXT: add x29, sp, #64
577+
; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
578+
; CHECK-NEXT: sub sp, sp, #16
579+
; CHECK-NEXT: rdsvl x8, #1
580+
; CHECK-NEXT: mov x9, sp
581+
; CHECK-NEXT: msub x9, x8, x8, x9
582+
; CHECK-NEXT: mov sp, x9
583+
; CHECK-NEXT: sub x10, x29, #80
584+
; CHECK-NEXT: stp x9, x8, [x29, #-80]
585+
; CHECK-NEXT: msr TPIDR2_EL0, x10
586+
; CHECK-NEXT: smstart sm
587+
; CHECK-NEXT: bl callee_sm
588+
; CHECK-NEXT: bl callee_sm
589+
; CHECK-NEXT: smstop sm
590+
; CHECK-NEXT: smstart za
591+
; CHECK-NEXT: mrs x8, TPIDR2_EL0
592+
; CHECK-NEXT: sub x0, x29, #80
593+
; CHECK-NEXT: cbnz x8, .LBB15_2
594+
; CHECK-NEXT: // %bb.1:
595+
; CHECK-NEXT: bl __arm_tpidr2_restore
596+
; CHECK-NEXT: .LBB15_2:
597+
; CHECK-NEXT: msr TPIDR2_EL0, xzr
598+
; CHECK-NEXT: sub sp, x29, #64
599+
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
600+
; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
601+
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
602+
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
603+
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
604+
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
605+
; CHECK-NEXT: ret
606+
call void @callee_sm()
607+
call void @callee_sm()
608+
ret void
609+
}
610+
611+
; normal caller -> streaming callees (with ZA agnostic state)
612+
define void @test15(ptr %callee) nounwind "aarch64_za_state_agnostic" {
613+
; CHECK-LABEL: test15:
614+
; CHECK: // %bb.0:
615+
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
616+
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
617+
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
618+
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
619+
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
620+
; CHECK-NEXT: add x29, sp, #64
621+
; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
622+
; CHECK-NEXT: bl __arm_sme_state_size
623+
; CHECK-NEXT: sub sp, sp, x0
624+
; CHECK-NEXT: mov x20, sp
625+
; CHECK-NEXT: mov x0, x20
626+
; CHECK-NEXT: bl __arm_sme_save
627+
; CHECK-NEXT: smstart sm
628+
; CHECK-NEXT: bl callee_sm
629+
; CHECK-NEXT: bl callee_sm
630+
; CHECK-NEXT: smstop sm
631+
; CHECK-NEXT: mov x0, x20
632+
; CHECK-NEXT: bl __arm_sme_restore
633+
; CHECK-NEXT: sub sp, x29, #64
634+
; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
635+
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
636+
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
637+
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
638+
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
639+
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
640+
; CHECK-NEXT: ret
641+
call void @callee_sm()
642+
call void @callee_sm()
643+
ret void
644+
}
645+
646+
; locally streaming caller -> normal callees (with ZA state)
647+
define void @test16(ptr %callee) nounwind "aarch64_pstate_sm_body" "aarch64_new_za" {
648+
; CHECK-LABEL: test16:
649+
; CHECK: // %bb.0:
650+
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
651+
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
652+
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
653+
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
654+
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
655+
; CHECK-NEXT: add x29, sp, #64
656+
; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
657+
; CHECK-NEXT: sub sp, sp, #16
658+
; CHECK-NEXT: rdsvl x8, #1
659+
; CHECK-NEXT: mov x9, sp
660+
; CHECK-NEXT: msub x9, x8, x8, x9
661+
; CHECK-NEXT: mov sp, x9
662+
; CHECK-NEXT: stp x9, x8, [x29, #-80]
663+
; CHECK-NEXT: mrs x8, TPIDR2_EL0
664+
; CHECK-NEXT: cbz x8, .LBB17_2
665+
; CHECK-NEXT: // %bb.1:
666+
; CHECK-NEXT: bl __arm_tpidr2_save
667+
; CHECK-NEXT: msr TPIDR2_EL0, xzr
668+
; CHECK-NEXT: zero {za}
669+
; CHECK-NEXT: .LBB17_2:
670+
; CHECK-NEXT: smstart za
671+
; CHECK-NEXT: smstart sm
672+
; CHECK-NEXT: sub x8, x29, #80
673+
; CHECK-NEXT: msr TPIDR2_EL0, x8
674+
; CHECK-NEXT: smstop sm
675+
; CHECK-NEXT: bl callee
676+
; CHECK-NEXT: bl callee
677+
; CHECK-NEXT: msr TPIDR2_EL0, xzr
678+
; CHECK-NEXT: smstop za
679+
; CHECK-NEXT: sub sp, x29, #64
680+
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
681+
; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
682+
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
683+
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
684+
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
685+
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
686+
; CHECK-NEXT: ret
687+
call void @callee()
688+
call void @callee()
689+
ret void
690+
}

0 commit comments

Comments
 (0)