Skip to content

Commit c8ce2df

Browse files
committed
[AArch64][SME] Allow SME peephole optimizations across SME pseudos
This allows folding `smstart`/`smstop` pairs in more cases.
1 parent 96d5567 commit c8ce2df

File tree

3 files changed

+132
-12
lines changed

3 files changed

+132
-12
lines changed

llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,11 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
184184
isSVERegOp(TRI, MRI, MI.getOperand(1)))
185185
Prev = nullptr;
186186
break;
187+
case AArch64::RestoreZAPseudo:
188+
case AArch64::InOutZAUsePseudo:
189+
case AArch64::CommitZASavePseudo:
190+
case AArch64::SMEStateAllocPseudo:
191+
case AArch64::RequiresZASavePseudo:
187192
case AArch64::ADJCALLSTACKDOWN:
188193
case AArch64::ADJCALLSTACKUP:
189194
case AArch64::ANDXri:

llvm/test/CodeGen/AArch64/sme-agnostic-za.ll

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -169,8 +169,6 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
169169
; CHECK-NEWLOWERING-NEXT: smstop sm
170170
; CHECK-NEWLOWERING-NEXT: mov x0, x8
171171
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
172-
; CHECK-NEWLOWERING-NEXT: smstart sm
173-
; CHECK-NEWLOWERING-NEXT: smstop sm
174172
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
175173
; CHECK-NEWLOWERING-NEXT: smstart sm
176174
; CHECK-NEWLOWERING-NEXT: mov x8, x0
@@ -268,19 +266,11 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
268266
; CHECK-NEWLOWERING-NEXT: .LBB5_2:
269267
; CHECK-NEWLOWERING-NEXT: mov x0, x8
270268
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
269+
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
271270
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_4
272271
; CHECK-NEWLOWERING-NEXT: // %bb.3:
273272
; CHECK-NEWLOWERING-NEXT: smstart sm
274273
; CHECK-NEWLOWERING-NEXT: .LBB5_4:
275-
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_6
276-
; CHECK-NEWLOWERING-NEXT: // %bb.5:
277-
; CHECK-NEWLOWERING-NEXT: smstop sm
278-
; CHECK-NEWLOWERING-NEXT: .LBB5_6:
279-
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
280-
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_8
281-
; CHECK-NEWLOWERING-NEXT: // %bb.7:
282-
; CHECK-NEWLOWERING-NEXT: smstart sm
283-
; CHECK-NEWLOWERING-NEXT: .LBB5_8:
284274
; CHECK-NEWLOWERING-NEXT: mov x8, x0
285275
; CHECK-NEWLOWERING-NEXT: mov x0, x19
286276
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore

llvm/test/CodeGen/AArch64/sme-peephole-opts.ll

Lines changed: 126 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s
2+
; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-new-sme-abi -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s
33

44
declare void @callee()
55
declare void @callee_sm() "aarch64_pstate_sm_enabled"
@@ -554,3 +554,128 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
554554
store <vscale x 4 x float> %res1, ptr %ptr
555555
ret void
556556
}
557+
558+
; normal caller -> streaming callees (with ZA state)
559+
define void @test14(ptr %callee) nounwind "aarch64_inout_za" {
560+
; CHECK-LABEL: test14:
561+
; CHECK: // %bb.0:
562+
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
563+
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
564+
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
565+
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
566+
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
567+
; CHECK-NEXT: add x29, sp, #64
568+
; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
569+
; CHECK-NEXT: sub sp, sp, #16
570+
; CHECK-NEXT: rdsvl x8, #1
571+
; CHECK-NEXT: mov x9, sp
572+
; CHECK-NEXT: msub x9, x8, x8, x9
573+
; CHECK-NEXT: mov sp, x9
574+
; CHECK-NEXT: sub x10, x29, #80
575+
; CHECK-NEXT: stp x9, x8, [x29, #-80]
576+
; CHECK-NEXT: msr TPIDR2_EL0, x10
577+
; CHECK-NEXT: smstart sm
578+
; CHECK-NEXT: bl callee_sm
579+
; CHECK-NEXT: bl callee_sm
580+
; CHECK-NEXT: smstop sm
581+
; CHECK-NEXT: smstart za
582+
; CHECK-NEXT: mrs x8, TPIDR2_EL0
583+
; CHECK-NEXT: sub x0, x29, #80
584+
; CHECK-NEXT: cbnz x8, .LBB15_2
585+
; CHECK-NEXT: // %bb.1:
586+
; CHECK-NEXT: bl __arm_tpidr2_restore
587+
; CHECK-NEXT: .LBB15_2:
588+
; CHECK-NEXT: msr TPIDR2_EL0, xzr
589+
; CHECK-NEXT: sub sp, x29, #64
590+
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
591+
; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
592+
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
593+
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
594+
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
595+
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
596+
; CHECK-NEXT: ret
597+
call void @callee_sm()
598+
call void @callee_sm()
599+
ret void
600+
}
601+
602+
; normal caller -> streaming callees (with ZA agnostic state)
603+
define void @test15(ptr %callee) nounwind "aarch64_za_state_agnostic" {
604+
; CHECK-LABEL: test15:
605+
; CHECK: // %bb.0:
606+
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
607+
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
608+
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
609+
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
610+
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
611+
; CHECK-NEXT: add x29, sp, #64
612+
; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
613+
; CHECK-NEXT: bl __arm_sme_state_size
614+
; CHECK-NEXT: sub sp, sp, x0
615+
; CHECK-NEXT: mov x20, sp
616+
; CHECK-NEXT: mov x0, x20
617+
; CHECK-NEXT: bl __arm_sme_save
618+
; CHECK-NEXT: smstart sm
619+
; CHECK-NEXT: bl callee_sm
620+
; CHECK-NEXT: bl callee_sm
621+
; CHECK-NEXT: smstop sm
622+
; CHECK-NEXT: mov x0, x20
623+
; CHECK-NEXT: bl __arm_sme_restore
624+
; CHECK-NEXT: sub sp, x29, #64
625+
; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
626+
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
627+
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
628+
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
629+
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
630+
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
631+
; CHECK-NEXT: ret
632+
call void @callee_sm()
633+
call void @callee_sm()
634+
ret void
635+
}
636+
637+
; locally streaming caller -> normal callees (with ZA state)
638+
define void @test16(ptr %callee) nounwind "aarch64_pstate_sm_body" "aarch64_new_za" {
639+
; CHECK-LABEL: test16:
640+
; CHECK: // %bb.0:
641+
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
642+
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
643+
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
644+
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
645+
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
646+
; CHECK-NEXT: add x29, sp, #64
647+
; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
648+
; CHECK-NEXT: sub sp, sp, #16
649+
; CHECK-NEXT: rdsvl x8, #1
650+
; CHECK-NEXT: mov x9, sp
651+
; CHECK-NEXT: msub x9, x8, x8, x9
652+
; CHECK-NEXT: mov sp, x9
653+
; CHECK-NEXT: stp x9, x8, [x29, #-80]
654+
; CHECK-NEXT: mrs x8, TPIDR2_EL0
655+
; CHECK-NEXT: cbz x8, .LBB17_2
656+
; CHECK-NEXT: // %bb.1:
657+
; CHECK-NEXT: bl __arm_tpidr2_save
658+
; CHECK-NEXT: msr TPIDR2_EL0, xzr
659+
; CHECK-NEXT: zero {za}
660+
; CHECK-NEXT: .LBB17_2:
661+
; CHECK-NEXT: smstart za
662+
; CHECK-NEXT: smstart sm
663+
; CHECK-NEXT: sub x8, x29, #80
664+
; CHECK-NEXT: msr TPIDR2_EL0, x8
665+
; CHECK-NEXT: smstop sm
666+
; CHECK-NEXT: bl callee
667+
; CHECK-NEXT: bl callee
668+
; CHECK-NEXT: msr TPIDR2_EL0, xzr
669+
; CHECK-NEXT: smstop za
670+
; CHECK-NEXT: sub sp, x29, #64
671+
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
672+
; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
673+
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
674+
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
675+
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
676+
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
677+
; CHECK-NEXT: ret
678+
call void @callee()
679+
call void @callee()
680+
ret void
681+
}

0 commit comments

Comments
 (0)