Skip to content

Commit 3081959

Browse files
committed
[AArch64] Implement intrinsics for SME FP8 FMLAL/FMLALL (single)
1 parent 6559b69 commit 3081959

File tree

7 files changed

+381
-38
lines changed

7 files changed

+381
-38
lines changed

clang/include/clang/Basic/arm_sme.td

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -829,25 +829,41 @@ let SMETargetGuard = "sme-f8f32" in {
829829
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<0, ImmCheck0_3>]>;
830830

831831
// FMLALL (indexed)
832-
def SVMLA_FP8_ZA32_VG4x1 : Inst<"svmla_lane_za32[_mf8]_vg4x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x1",
833-
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
834-
def SVMLA_FP8_ZA32_VG4x2 : Inst<"svmla_lane_za32[_mf8]_vg4x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x2",
835-
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
836-
def SVMLA_FP8_ZA16_VG4x4 : Inst<"svmla_lane_za32[_mf8]_vg4x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x4",
837-
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
832+
def SVMLA_FP8_LANE_ZA32_VG4x1 : Inst<"svmla_lane_za32[_mf8]_vg4x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x1",
833+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
834+
def SVMLA_FP8_LANE_ZA32_VG4x2 : Inst<"svmla_lane_za32[_mf8]_vg4x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x2",
835+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
836+
def SVMLA_FP8_LANE_ZA16_VG4x4 : Inst<"svmla_lane_za32[_mf8]_vg4x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x4",
837+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
838+
839+
// FMLAL (mutliple and single)
840+
def SVMLA_FP8_SINGLE_ZA32_VG4x1 : Inst<"svmla[_single]_za32[_mf8]_vg4x1_fpm", "vmdd>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x1",
841+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
842+
def SVMLA_FP8_SINGLE_ZA32_VG4x2 : Inst<"svmla[_single]_za32[_mf8]_vg4x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x2",
843+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
844+
def SVMLA_FP8_SINGLE_ZA32_VG4x4 : Inst<"svmla[_single]_za32[_mf8]_vg4x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x4",
845+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
838846
}
839847

840848
let SMETargetGuard = "sme-f8f16" in {
841849
def SVMOPA_FP8_ZA16 : Inst<"svmopa_za16[_mf8]_m_fpm", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za16",
842850
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<0, ImmCheck0_1>]>;
843851

844852
// FMLAL (indexed)
845-
def SVMLA_FP8_ZA16_VG2x1 : Inst<"svmla_lane_za16[_mf8]_vg2x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x1",
846-
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
847-
def SVMLA_FP8_ZA16_VG2x2 : Inst<"svmla_lane_za16[_mf8]_vg2x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x2",
848-
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
849-
def SVMLA_FP8_ZA16_VG2x4 : Inst<"svmla_lane_za16[_mf8]_vg2x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x4",
850-
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
853+
def SVMLA_FP8_LANE_ZA16_VG2x1 : Inst<"svmla_lane_za16[_mf8]_vg2x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x1",
854+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
855+
def SVMLA_FP8_LANE_ZA16_VG2x2 : Inst<"svmla_lane_za16[_mf8]_vg2x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x2",
856+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
857+
def SVMLA_FP8_LANE_ZA16_VG2x4 : Inst<"svmla_lane_za16[_mf8]_vg2x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x4",
858+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
859+
860+
// FMLAL (mutliple and single)
861+
def SVMLA_FP8_SINGLE_ZA16_VG2x1 : Inst<"svmla[_single]_za16[_mf8]_vg2x1_fpm", "vmdd>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x1",
862+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
863+
def SVMLA_FP8_SINGLE_ZA16_VG2x2 : Inst<"svmla[_single]_za16[_mf8]_vg2x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x2",
864+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
865+
def SVMLA_FP8_SINGLE_ZA16_VG2x4 : Inst<"svmla[_single]_za16[_mf8]_vg2x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x4",
866+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
851867
}
852868

853869
} // let SVETargetGuard = InvalidMode

clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_mla.c

Lines changed: 116 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@
1010
#include <arm_sme.h>
1111

1212
#ifdef SME_OVERLOADED_FORMS
13-
#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
13+
#define SME_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED, A5) A1##A3##A5
1414
#else
15-
#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3
15+
#define SME_ACLE_FUNC(A1, A2, A3, A4, A5) A1##A2##A3##A4##A5
1616
#endif
1717

1818
// FMLAL (indexed)
@@ -32,7 +32,7 @@
3232
// CPP-CHECK-NEXT: ret void
3333
//
3434
void test_svmla_lane_za16_vg2x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
35-
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x1_fpm)(slice, zn, zm, 0, fpm);
35+
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x1_fpm,,)(slice, zn, zm, 0, fpm);
3636
}
3737

3838
// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_vg2x2(
@@ -50,7 +50,7 @@ void test_svmla_lane_za16_vg2x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm,
5050
// CPP-CHECK-NEXT: ret void
5151
//
5252
void test_svmla_lane_za16_vg2x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
53-
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x2_fpm)(slice, zn, zm, 15, fpm);
53+
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x2_fpm,,)(slice, zn, zm, 15, fpm);
5454
}
5555

5656
// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_vg2x4(
@@ -68,7 +68,7 @@ void test_svmla_lane_za16_vg2x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm
6868
// CPP-CHECK-NEXT: ret void
6969
//
7070
void test_svmla_lane_za16_vg2x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
71-
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x4_fpm)(slice, zn, zm, 7, fpm);
71+
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x4_fpm,,)(slice, zn, zm, 7, fpm);
7272
}
7373

7474
// FMLALL (indexed)
@@ -88,7 +88,7 @@ void test_svmla_lane_za16_vg2x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm
8888
// CPP-CHECK-NEXT: ret void
8989
//
9090
void test_svmla_lane_za32_vg4x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
91-
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x1_fpm)(slice, zn, zm, 0, fpm);
91+
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x1_fpm,,)(slice, zn, zm, 0, fpm);
9292
}
9393

9494
// CHECK-LABEL: define dso_local void @test_svmla_lane_za32_vg4x2(
@@ -106,7 +106,7 @@ void test_svmla_lane_za32_vg4x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm,
106106
// CPP-CHECK-NEXT: ret void
107107
//
108108
void test_svmla_lane_za32_vg4x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
109-
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x2_fpm)(slice, zn, zm, 15, fpm);
109+
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x2_fpm,,)(slice, zn, zm, 15, fpm);
110110
}
111111

112112
// CHECK-LABEL: define dso_local void @test_svmla_lane_za32_vg4x4(
@@ -124,5 +124,113 @@ void test_svmla_lane_za32_vg4x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm
124124
// CPP-CHECK-NEXT: ret void
125125
//
126126
void test_svmla_lane_za32_vg4x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
127-
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x4_fpm)(slice, zn, zm, 7, fpm);
127+
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x4_fpm,,)(slice, zn, zm, 7, fpm);
128+
}
129+
130+
// CHECK-LABEL: define dso_local void @test_svmla_single_za16_vg2x1(
131+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
132+
// CHECK-NEXT: [[ENTRY:.*:]]
133+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
134+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
135+
// CHECK-NEXT: ret void
136+
//
137+
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za16_vg2x1ju13__SVMfloat8_tS_m(
138+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
139+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
140+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
141+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
142+
// CPP-CHECK-NEXT: ret void
143+
//
144+
void test_svmla_single_za16_vg2x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
145+
SME_ACLE_FUNC(svmla,_single,_za16,_mf8,_vg2x1_fpm)(slice, zn, zm, fpm);
146+
}
147+
148+
// CHECK-LABEL: define dso_local void @test_svmla_single_za16_vg2x2(
149+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
150+
// CHECK-NEXT: [[ENTRY:.*:]]
151+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
152+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
153+
// CHECK-NEXT: ret void
154+
//
155+
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za16_vg2x2j13svmfloat8x2_tu13__SVMfloat8_tm(
156+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
157+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
158+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
159+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
160+
// CPP-CHECK-NEXT: ret void
161+
//
162+
void test_svmla_single_za16_vg2x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
163+
SME_ACLE_FUNC(svmla,_single,_za16,_mf8,_vg2x2_fpm)(slice, zn, zm, fpm);
164+
}
165+
166+
// CHECK-LABEL: define dso_local void @test_svmla_single_za16_vg2x4(
167+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
168+
// CHECK-NEXT: [[ENTRY:.*:]]
169+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
170+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
171+
// CHECK-NEXT: ret void
172+
//
173+
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za16_vg2x4j13svmfloat8x4_tu13__SVMfloat8_tm(
174+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
175+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
176+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
177+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
178+
// CPP-CHECK-NEXT: ret void
179+
//
180+
void test_svmla_single_za16_vg2x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
181+
SME_ACLE_FUNC(svmla,_single,_za16,_mf8,_vg2x4_fpm)(slice, zn, zm, fpm);
182+
}
183+
184+
// CHECK-LABEL: define dso_local void @test_svmla_single_za32_vg4x1(
185+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
186+
// CHECK-NEXT: [[ENTRY:.*:]]
187+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
188+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
189+
// CHECK-NEXT: ret void
190+
//
191+
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za32_vg4x1ju13__SVMfloat8_tS_m(
192+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
193+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
194+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
195+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
196+
// CPP-CHECK-NEXT: ret void
197+
//
198+
void test_svmla_single_za32_vg4x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
199+
SME_ACLE_FUNC(svmla,_single,_za32,_mf8,_vg4x1_fpm)(slice, zn, zm, fpm);
200+
}
201+
202+
// CHECK-LABEL: define dso_local void @test_svmla_single_za32_vg4x2(
203+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
204+
// CHECK-NEXT: [[ENTRY:.*:]]
205+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
206+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
207+
// CHECK-NEXT: ret void
208+
//
209+
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za32_vg4x2j13svmfloat8x2_tu13__SVMfloat8_tm(
210+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
211+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
212+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
213+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
214+
// CPP-CHECK-NEXT: ret void
215+
//
216+
void test_svmla_single_za32_vg4x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
217+
SME_ACLE_FUNC(svmla,_single,_za32,_mf8,_vg4x2_fpm)(slice, zn, zm, fpm);
218+
}
219+
220+
// CHECK-LABEL: define dso_local void @test_svmla_single_za32_vg4x4(
221+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
222+
// CHECK-NEXT: [[ENTRY:.*:]]
223+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
224+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
225+
// CHECK-NEXT: ret void
226+
//
227+
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za32_vg4x4j13svmfloat8x4_tu13__SVMfloat8_tm(
228+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
229+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
230+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
231+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
232+
// CPP-CHECK-NEXT: ret void
233+
//
234+
void test_svmla_single_za32_vg4x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
235+
SME_ACLE_FUNC(svmla,_single,_za32,_mf8,_vg4x4_fpm)(slice, zn, zm, fpm);
128236
}

clang/test/Sema/aarch64-fp8-intrinsics/acle_sme_fp8_mla.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,22 @@ void test_svmla(uint32_t slice, svmfloat8_t zn, svmfloat8x2_t znx2, svmfloat8x4_
2323

2424
// expected-error@+1 {{'svmla_lane_za32_mf8_vg4x4_fpm' needs target feature sme,sme-f8f32}}
2525
svmla_lane_za32_mf8_vg4x4_fpm(slice, znx4, zn, 0, fpmr);
26+
27+
// expected-error@+1 {{'svmla_single_za16_mf8_vg2x1_fpm' needs target feature sme,sme-f8f16}}
28+
svmla_single_za16_mf8_vg2x1_fpm(slice, zn, zn, fpmr);
29+
30+
// expected-error@+1 {{'svmla_single_za16_mf8_vg2x2_fpm' needs target feature sme,sme-f8f16}}
31+
svmla_single_za16_mf8_vg2x2_fpm(slice, znx2, zn, fpmr);
32+
33+
// expected-error@+1 {{'svmla_single_za16_mf8_vg2x4_fpm' needs target feature sme,sme-f8f16}}
34+
svmla_single_za16_mf8_vg2x4_fpm(slice, znx4, zn, fpmr);
35+
36+
// expected-error@+1 {{'svmla_single_za32_mf8_vg4x1_fpm' needs target feature sme,sme-f8f32}}
37+
svmla_single_za32_mf8_vg4x1_fpm(slice, zn, zn, fpmr);
38+
39+
// expected-error@+1 {{'svmla_single_za32_mf8_vg4x2_fpm' needs target feature sme,sme-f8f32}}
40+
svmla_single_za32_mf8_vg4x2_fpm(slice, znx2, zn, fpmr);
41+
42+
// expected-error@+1 {{'svmla_single_za32_mf8_vg4x4_fpm' needs target feature sme,sme-f8f32}}
43+
svmla_single_za32_mf8_vg4x4_fpm(slice, znx4, zn, fpmr);
2644
}

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3902,13 +3902,39 @@ class SME_FP8_ZA_LANE_VGx4 :
39023902
llvm_i32_ty],
39033903
[IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<6>>]>;
39043904

3905+
class SME_FP8_ZA_SINGLE_VGx1 :
3906+
DefaultAttrsIntrinsic<[], [llvm_i32_ty,
3907+
llvm_nxv16i8_ty,
3908+
llvm_nxv16i8_ty],
3909+
[IntrInaccessibleMemOnly, IntrHasSideEffects]>;
3910+
3911+
class SME_FP8_ZA_SINGLE_VGx2 :
3912+
DefaultAttrsIntrinsic<[], [llvm_i32_ty,
3913+
llvm_nxv16i8_ty, llvm_nxv16i8_ty,
3914+
llvm_nxv16i8_ty],
3915+
[IntrInaccessibleMemOnly, IntrHasSideEffects]>;
3916+
3917+
class SME_FP8_ZA_SINGLE_VGx4 :
3918+
DefaultAttrsIntrinsic<[], [llvm_i32_ty,
3919+
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
3920+
llvm_nxv16i8_ty],
3921+
[IntrInaccessibleMemOnly, IntrHasSideEffects]>;
3922+
39053923
// Double-vector groups (F8F16)
39063924
def int_aarch64_sme_fp8_fmlal_lane_za16_vg2x1 : SME_FP8_ZA_LANE_VGx1;
39073925
def int_aarch64_sme_fp8_fmlal_lane_za16_vg2x2 : SME_FP8_ZA_LANE_VGx2;
39083926
def int_aarch64_sme_fp8_fmlal_lane_za16_vg2x4 : SME_FP8_ZA_LANE_VGx4;
39093927

3928+
def int_aarch64_sme_fp8_fmlal_single_za16_vg2x1 : SME_FP8_ZA_SINGLE_VGx1;
3929+
def int_aarch64_sme_fp8_fmlal_single_za16_vg2x2 : SME_FP8_ZA_SINGLE_VGx2;
3930+
def int_aarch64_sme_fp8_fmlal_single_za16_vg2x4 : SME_FP8_ZA_SINGLE_VGx4;
3931+
39103932
// Quad-vector groups (F8F32)
39113933
def int_aarch64_sme_fp8_fmlall_lane_za32_vg4x1 : SME_FP8_ZA_LANE_VGx1;
39123934
def int_aarch64_sme_fp8_fmlall_lane_za32_vg4x2 : SME_FP8_ZA_LANE_VGx2;
39133935
def int_aarch64_sme_fp8_fmlall_lane_za32_vg4x4 : SME_FP8_ZA_LANE_VGx4;
3936+
3937+
def int_aarch64_sme_fp8_fmlall_single_za32_vg4x1 : SME_FP8_ZA_SINGLE_VGx1;
3938+
def int_aarch64_sme_fp8_fmlall_single_za32_vg4x2 : SME_FP8_ZA_SINGLE_VGx2;
3939+
def int_aarch64_sme_fp8_fmlall_single_za32_vg4x4 : SME_FP8_ZA_SINGLE_VGx4;
39143940
}

0 commit comments

Comments
 (0)