Skip to content
This repository was archived by the owner on Sep 15, 2025. It is now read-only.

Commit 21b531e

Browse files
authored
[clang][llvm][aarch64] Add aarch64_sme_in_streaming_mode intrinsic (llvm#120265)
Replacing the existing streaming-mode function call with an intrinsic allows us to make further optimisations around it. For example, if it is called within a function whose streaming mode is already known, we can remove the dead code and avoid the redundant conditional branch.
1 parent 064da42 commit 21b531e

File tree

7 files changed

+89
-32
lines changed

7 files changed

+89
-32
lines changed

clang/include/clang/Basic/arm_sme.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -716,6 +716,8 @@ let SMETargetGuard = "sme2" in {
716716
def SVZERO_ZT : Inst<"svzero_zt", "vi", "", MergeNone, "aarch64_sme_zero_zt", [IsOverloadNone, IsStreamingCompatible, IsOutZT0], [ImmCheck<0, ImmCheck0_0>]>;
717717
}
718718

719+
def IN_STREAMING_MODE : Inst<"__arm_in_streaming_mode", "sv", "Pc", MergeNone, "aarch64_sme_in_streaming_mode", [IsOverloadNone, IsStreamingCompatible], []>;
720+
719721
//
720722
// lookup table expand four contiguous registers
721723
//

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11327,6 +11327,19 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
1132711327
if (Builtin->LLVMIntrinsic == 0)
1132811328
return nullptr;
1132911329

11330+
if (BuiltinID == SME::BI__builtin_sme___arm_in_streaming_mode) {
11331+
// If we already know the streaming mode, don't bother with the intrinsic
11332+
// and emit a constant instead.
11333+
const auto *FD = cast<FunctionDecl>(CurFuncDecl);
11334+
if (const auto *FPT = FD->getType()->getAs<FunctionProtoType>()) {
11335+
unsigned SMEAttrs = FPT->getAArch64SMEAttributes();
11336+
if (!(SMEAttrs & FunctionType::SME_PStateSMCompatibleMask)) {
11337+
bool IsStreaming = SMEAttrs & FunctionType::SME_PStateSMEnabledMask;
11338+
return ConstantInt::getBool(Builder.getContext(), IsStreaming);
11339+
}
11340+
}
11341+
}
11342+
1133011343
// Predicates must match the main datatype.
1133111344
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
1133211345
if (auto PredTy = dyn_cast<llvm::VectorType>(Ops[i]->getType()))

clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_state_funs.c

Lines changed: 44 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,34 +6,53 @@
66

77
#include <arm_sme.h>
88

9-
// CHECK-LABEL: @test_in_streaming_mode(
9+
// CHECK-LABEL: @test_in_streaming_mode_streaming_compatible(
1010
// CHECK-NEXT: entry:
11-
// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3:[0-9]+]]
12-
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
13-
// CHECK-NEXT: [[AND_I:%.*]] = and i64 [[TMP1]], 1
14-
// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0
15-
// CHECK-NEXT: ret i1 [[TOBOOL_I]]
11+
// CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.aarch64.sme.in.streaming.mode()
12+
// CHECK-NEXT: ret i1 [[TMP0]]
1613
//
17-
// CPP-CHECK-LABEL: @_Z22test_in_streaming_modev(
14+
// CPP-CHECK-LABEL: @_Z43test_in_streaming_mode_streaming_compatiblev(
1815
// CPP-CHECK-NEXT: entry:
19-
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3:[0-9]+]]
20-
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
21-
// CPP-CHECK-NEXT: [[AND_I:%.*]] = and i64 [[TMP1]], 1
22-
// CPP-CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0
23-
// CPP-CHECK-NEXT: ret i1 [[TOBOOL_I]]
16+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.aarch64.sme.in.streaming.mode()
17+
// CPP-CHECK-NEXT: ret i1 [[TMP0]]
18+
//
19+
bool test_in_streaming_mode_streaming_compatible(void) __arm_streaming_compatible {
20+
return __arm_in_streaming_mode();
21+
}
22+
23+
// CHECK-LABEL: @test_in_streaming_mode_streaming(
24+
// CHECK-NEXT: entry:
25+
// CHECK-NEXT: ret i1 true
26+
//
27+
// CPP-CHECK-LABEL: @_Z32test_in_streaming_mode_streamingv(
28+
// CPP-CHECK-NEXT: entry:
29+
// CPP-CHECK-NEXT: ret i1 true
30+
//
31+
bool test_in_streaming_mode_streaming(void) __arm_streaming {
32+
//
33+
return __arm_in_streaming_mode();
34+
}
35+
36+
// CHECK-LABEL: @test_in_streaming_mode_non_streaming(
37+
// CHECK-NEXT: entry:
38+
// CHECK-NEXT: ret i1 false
39+
//
40+
// CPP-CHECK-LABEL: @_Z36test_in_streaming_mode_non_streamingv(
41+
// CPP-CHECK-NEXT: entry:
42+
// CPP-CHECK-NEXT: ret i1 false
2443
//
25-
bool test_in_streaming_mode(void) __arm_streaming_compatible {
44+
bool test_in_streaming_mode_non_streaming(void) {
2645
return __arm_in_streaming_mode();
2746
}
2847

2948
// CHECK-LABEL: @test_za_disable(
3049
// CHECK-NEXT: entry:
31-
// CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR3]]
50+
// CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR7:[0-9]+]]
3251
// CHECK-NEXT: ret void
3352
//
3453
// CPP-CHECK-LABEL: @_Z15test_za_disablev(
3554
// CPP-CHECK-NEXT: entry:
36-
// CPP-CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR3]]
55+
// CPP-CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR7:[0-9]+]]
3756
// CPP-CHECK-NEXT: ret void
3857
//
3958
void test_za_disable(void) __arm_streaming_compatible {
@@ -42,14 +61,14 @@ void test_za_disable(void) __arm_streaming_compatible {
4261

4362
// CHECK-LABEL: @test_has_sme(
4463
// CHECK-NEXT: entry:
45-
// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3]]
64+
// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR7]]
4665
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
4766
// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0
4867
// CHECK-NEXT: ret i1 [[TOBOOL_I]]
4968
//
5069
// CPP-CHECK-LABEL: @_Z12test_has_smev(
5170
// CPP-CHECK-NEXT: entry:
52-
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3]]
71+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR7]]
5372
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
5473
// CPP-CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0
5574
// CPP-CHECK-NEXT: ret i1 [[TOBOOL_I]]
@@ -72,12 +91,12 @@ void test_svundef_za(void) __arm_streaming_compatible __arm_out("za") {
7291

7392
// CHECK-LABEL: @test_sc_memcpy(
7493
// CHECK-NEXT: entry:
75-
// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
94+
// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
7695
// CHECK-NEXT: ret ptr [[CALL]]
7796
//
7897
// CPP-CHECK-LABEL: @_Z14test_sc_memcpyPvPKvm(
7998
// CPP-CHECK-NEXT: entry:
80-
// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
99+
// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
81100
// CPP-CHECK-NEXT: ret ptr [[CALL]]
82101
//
83102
void *test_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_compatible {
@@ -86,12 +105,12 @@ void *test_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_comp
86105

87106
// CHECK-LABEL: @test_sc_memmove(
88107
// CHECK-NEXT: entry:
89-
// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
108+
// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
90109
// CHECK-NEXT: ret ptr [[CALL]]
91110
//
92111
// CPP-CHECK-LABEL: @_Z15test_sc_memmovePvPKvm(
93112
// CPP-CHECK-NEXT: entry:
94-
// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
113+
// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
95114
// CPP-CHECK-NEXT: ret ptr [[CALL]]
96115
//
97116
void *test_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_compatible {
@@ -100,12 +119,12 @@ void *test_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_com
100119

101120
// CHECK-LABEL: @test_sc_memset(
102121
// CHECK-NEXT: entry:
103-
// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
122+
// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
104123
// CHECK-NEXT: ret ptr [[CALL]]
105124
//
106125
// CPP-CHECK-LABEL: @_Z14test_sc_memsetPvim(
107126
// CPP-CHECK-NEXT: entry:
108-
// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
127+
// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
109128
// CPP-CHECK-NEXT: ret ptr [[CALL]]
110129
//
111130
void *test_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible {
@@ -114,12 +133,12 @@ void *test_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible {
114133

115134
// CHECK-LABEL: @test_sc_memchr(
116135
// CHECK-NEXT: entry:
117-
// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
136+
// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
118137
// CHECK-NEXT: ret ptr [[CALL]]
119138
//
120139
// CPP-CHECK-LABEL: @_Z14test_sc_memchrPvim(
121140
// CPP-CHECK-NEXT: entry:
122-
// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
141+
// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
123142
// CPP-CHECK-NEXT: ret ptr [[CALL]]
124143
//
125144
void *test_sc_memchr(void *s, int c, size_t n) __arm_streaming_compatible {

clang/utils/TableGen/SveEmitter.cpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1640,13 +1640,6 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) {
16401640
OS << " return x0 & (1ULL << 63);\n";
16411641
OS << "}\n\n";
16421642

1643-
OS << "__ai bool __arm_in_streaming_mode(void) __arm_streaming_compatible "
1644-
"{\n";
1645-
OS << " uint64_t x0, x1;\n";
1646-
OS << " __builtin_arm_get_sme_state(&x0, &x1);\n";
1647-
OS << " return x0 & 1;\n";
1648-
OS << "}\n\n";
1649-
16501643
OS << "void *__arm_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_compatible;\n";
16511644
OS << "void *__arm_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_compatible;\n";
16521645
OS << "void *__arm_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible;\n";

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2974,6 +2974,7 @@ let TargetPrefix = "aarch64" in {
29742974

29752975

29762976
def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
2977+
def int_aarch64_sme_in_streaming_mode : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrNoMem]>, ClangBuiltin<"__builtin_arm_in_streaming_mode">;
29772978

29782979
class SME_OuterProduct_Intrinsic
29792980
: DefaultAttrsIntrinsic<[],

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1183,6 +1183,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
11831183
setMaxDivRemBitWidthSupported(128);
11841184

11851185
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1186+
if (Subtarget->hasSME())
1187+
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
11861188

11871189
if (Subtarget->isNeonAvailable()) {
11881190
// FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
@@ -27429,6 +27431,15 @@ void AArch64TargetLowering::ReplaceNodeResults(
2742927431
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
2743027432
return;
2743127433
}
27434+
case Intrinsic::aarch64_sme_in_streaming_mode: {
27435+
SDLoc DL(N);
27436+
SDValue Chain = DAG.getEntryNode();
27437+
SDValue RuntimePStateSM =
27438+
getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
27439+
Results.push_back(
27440+
DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
27441+
return;
27442+
}
2743227443
case Intrinsic::experimental_vector_match:
2743327444
case Intrinsic::get_active_lane_mask: {
2743427445
if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
3+
4+
5+
define i1 @streaming_mode_streaming_compatible() #0 {
6+
; CHECK-LABEL: streaming_mode_streaming_compatible:
7+
; CHECK: // %bb.0:
8+
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
9+
; CHECK-NEXT: bl __arm_sme_state
10+
; CHECK-NEXT: and w0, w0, #0x1
11+
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
12+
; CHECK-NEXT: ret
13+
%mode = tail call noundef i1 @llvm.aarch64.sme.in.streaming.mode()
14+
ret i1 %mode
15+
}
16+
17+
18+
attributes #0 = {nounwind memory(none) "aarch64_pstate_sm_compatible"}

0 commit comments

Comments
 (0)