Skip to content

Commit ad6aadb

Browse files
committed
[AArch64][llvm] Add support for vmmlaq_[f16,f32]_mf8 intrinsics
Add support for the following new intrinsics:

```
float16x8_t vmmlaq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, fpm_t);
float32x4_t vmmlaq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t);
```
1 parent e964acf commit ad6aadb

File tree

4 files changed

+48
-1
lines changed

4 files changed

+48
-1
lines changed

clang/include/clang/Basic/arm_neon.td

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1896,6 +1896,14 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
18961896
def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>;
18971897
}
18981898

1899+
let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f16mm,neon" in {
1900+
def VMMLA_F16_MF8 : VInst<"vmmla_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
1901+
}
1902+
1903+
let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f32mm,neon" in {
1904+
def VMMLA_F32_MF8 : VInst<"vmmla_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
1905+
}
1906+
18991907
let TargetGuard = "i8mm,neon" in {
19001908
def VMMLA : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
19011909
def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;

clang/lib/CodeGen/TargetBuiltins/ARM.cpp

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7793,6 +7793,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
77937793
Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
77947794
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
77957795
}
7796+
case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
7797+
return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
7798+
{llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
7799+
"fmmla");
7800+
case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
7801+
return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
7802+
{llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
7803+
"fmmla");
77967804
case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
77977805
ExtractLow = true;
77987806
[[fallthrough]];

clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c

Lines changed: 26 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2-
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
2+
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm -target-feature +f8f16mm -target-feature +f8f32mm -target-feature +fp8 \
33
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
44
// RUN: | opt -S -passes=mem2reg,sroa \
55
// RUN: | FileCheck %s
@@ -32,6 +32,31 @@ uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
3232
return vmmlaq_u32(r, a, b);
3333
}
3434

35+
// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
36+
// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
37+
// CHECK-NEXT: [[ENTRY:.*:]]
38+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
39+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
40+
// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
41+
// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
42+
// CHECK-NEXT: [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
43+
// CHECK-NEXT: ret <8 x half> [[FMMLA1_I]]
44+
//
45+
float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
46+
return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
47+
}
48+
49+
// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
50+
// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
51+
// CHECK-NEXT: [[ENTRY:.*:]]
52+
// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
53+
// CHECK-NEXT: [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
54+
// CHECK-NEXT: ret <4 x float> [[FMMLA_I]]
55+
//
56+
float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
57+
return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
58+
}
59+
3560
// CHECK-LABEL: define dso_local <4 x i32> @test_vusmmlaq_s32(
3661
// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
3762
// CHECK-NEXT: [[ENTRY:.*:]]

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -217,6 +217,11 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
217217
: DefaultAttrsIntrinsic<[llvm_v4f32_ty],
218218
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
219219
[IntrNoMem]>;
220+
221+
class AdvSIMD_MatMul_fpm_Intrinsic
222+
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
223+
[LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty],
224+
[IntrNoMem]>;
220225
}
221226

222227
// Arithmetic ops
@@ -499,6 +504,7 @@ let TargetPrefix = "aarch64" in {
499504
def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
500505
def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
501506
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
507+
def int_aarch64_neon_fmmla : AdvSIMD_MatMul_fpm_Intrinsic;
502508
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
503509
def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
504510
def int_aarch64_neon_bfmmla

0 commit comments

Comments
 (0)