-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[AArch64][llvm] Add support for Neon vmmlaq_{f16,f32}_mf8_fpm intrinsics #165431
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
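@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-aarch64 Author: Jonathan Thackray (jthackray) Changes: Add support for the following new AArch64 Neon intrinsics: `float16x8_t vmmlaq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, fpm_t);` `float32x4_t vmmlaq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t);` Full diff: https://github.com/llvm/llvm-project/pull/165431.diff 4 Files Affected: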
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index ef196103035e8..8e2174c880ed8 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1896,6 +1896,14 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>;
}
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f16mm,neon" in {
+ def VMMLA_F16_MF8 : VInst<"vmmla_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+}
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f32mm,neon" in {
+ def VMMLA_F32_MF8 : VInst<"vmmla_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+}
+
let TargetGuard = "i8mm,neon" in {
def VMMLA : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 60f9b86333670..4075f56e6a032 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -7793,6 +7793,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
}
+ case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
+ return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
+ {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
+ "fmmla");
+ case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
+ return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
+ {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+ "fmmla");
case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
ExtractLow = true;
[[fallthrough]];
diff --git a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
index 6fffcb6c6b391..0d592af59f85c 100644
--- a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
@@ -1,5 +1,5 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm -target-feature +f8f16mm -target-feature +f8f32mm -target-feature +fp8 \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg,sroa \
// RUN: | FileCheck %s
@@ -32,6 +32,31 @@ uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
return vmmlaq_u32(r, a, b);
}
+// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
+// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
+// CHECK-NEXT: ret <8 x half> [[FMMLA1_I]]
+//
+float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+ return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
+// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT: [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
+// CHECK-NEXT: ret <4 x float> [[FMMLA_I]]
+//
+float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+ return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
+}
+
// CHECK-LABEL: define dso_local <4 x i32> @test_vusmmlaq_s32(
// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b81edc385cd43..78a60e839775e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -217,6 +217,11 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
: DefaultAttrsIntrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
[IntrNoMem]>;
+
+ class AdvSIMD_MatMul_fpm_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty],
+ [IntrNoMem]>;
}
// Arithmetic ops
@@ -499,6 +504,7 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
+ def int_aarch64_neon_fmmla : AdvSIMD_MatMul_fpm_Intrinsic;
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_bfmmla
|
|
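@llvm/pr-subscribers-clang-codegen Author: Jonathan Thackray (jthackray) Changes: Add support for the following new AArch64 Neon intrinsics: `float16x8_t vmmlaq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, fpm_t);` `float32x4_t vmmlaq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t);` Full diff: https://github.com/llvm/llvm-project/pull/165431.diff 4 Files Affected: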
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index ef196103035e8..8e2174c880ed8 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1896,6 +1896,14 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>;
}
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f16mm,neon" in {
+ def VMMLA_F16_MF8 : VInst<"vmmla_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+}
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f32mm,neon" in {
+ def VMMLA_F32_MF8 : VInst<"vmmla_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+}
+
let TargetGuard = "i8mm,neon" in {
def VMMLA : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 60f9b86333670..4075f56e6a032 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -7793,6 +7793,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
}
+ case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
+ return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
+ {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
+ "fmmla");
+ case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
+ return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
+ {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+ "fmmla");
case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
ExtractLow = true;
[[fallthrough]];
diff --git a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
index 6fffcb6c6b391..0d592af59f85c 100644
--- a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
@@ -1,5 +1,5 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm -target-feature +f8f16mm -target-feature +f8f32mm -target-feature +fp8 \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg,sroa \
// RUN: | FileCheck %s
@@ -32,6 +32,31 @@ uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
return vmmlaq_u32(r, a, b);
}
+// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
+// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
+// CHECK-NEXT: ret <8 x half> [[FMMLA1_I]]
+//
+float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+ return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
+// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT: [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
+// CHECK-NEXT: ret <4 x float> [[FMMLA_I]]
+//
+float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+ return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
+}
+
// CHECK-LABEL: define dso_local <4 x i32> @test_vusmmlaq_s32(
// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b81edc385cd43..78a60e839775e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -217,6 +217,11 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
: DefaultAttrsIntrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
[IntrNoMem]>;
+
+ class AdvSIMD_MatMul_fpm_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty],
+ [IntrNoMem]>;
}
// Arithmetic ops
@@ -499,6 +504,7 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
+ def int_aarch64_neon_fmmla : AdvSIMD_MatMul_fpm_Intrinsic;
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_bfmmla
|
|
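@llvm/pr-subscribers-llvm-ir Author: Jonathan Thackray (jthackray) Changes: Add support for the following new AArch64 Neon intrinsics: `float16x8_t vmmlaq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, fpm_t);` `float32x4_t vmmlaq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t);` Full diff: https://github.com/llvm/llvm-project/pull/165431.diff 4 Files Affected: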
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index ef196103035e8..8e2174c880ed8 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1896,6 +1896,14 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>;
}
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f16mm,neon" in {
+ def VMMLA_F16_MF8 : VInst<"vmmla_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+}
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f32mm,neon" in {
+ def VMMLA_F32_MF8 : VInst<"vmmla_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+}
+
let TargetGuard = "i8mm,neon" in {
def VMMLA : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 60f9b86333670..4075f56e6a032 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -7793,6 +7793,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
}
+ case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
+ return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
+ {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
+ "fmmla");
+ case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
+ return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
+ {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+ "fmmla");
case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
ExtractLow = true;
[[fallthrough]];
diff --git a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
index 6fffcb6c6b391..0d592af59f85c 100644
--- a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
@@ -1,5 +1,5 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm -target-feature +f8f16mm -target-feature +f8f32mm -target-feature +fp8 \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg,sroa \
// RUN: | FileCheck %s
@@ -32,6 +32,31 @@ uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
return vmmlaq_u32(r, a, b);
}
+// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
+// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
+// CHECK-NEXT: ret <8 x half> [[FMMLA1_I]]
+//
+float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+ return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
+// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT: [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
+// CHECK-NEXT: ret <4 x float> [[FMMLA_I]]
+//
+float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+ return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
+}
+
// CHECK-LABEL: define dso_local <4 x i32> @test_vusmmlaq_s32(
// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b81edc385cd43..78a60e839775e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -217,6 +217,11 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
: DefaultAttrsIntrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
[IntrNoMem]>;
+
+ class AdvSIMD_MatMul_fpm_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty],
+ [IntrNoMem]>;
}
// Arithmetic ops
@@ -499,6 +504,7 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
+ def int_aarch64_neon_fmmla : AdvSIMD_MatMul_fpm_Intrinsic;
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_bfmmla
|
Lukacma
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I left some comments for the frontend part, but your patch seems to be missing further lowering from LLVM-IR to assembly.
b08c7b5 to
525dd5f
Compare
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
525dd5f to
e45f4a6
Compare
fbc43c0 to
533acb2
Compare
This stack of pull requests is managed by Graphite. Learn more about stacking. |
Thanks, now added. |
Add support for the following new intrinsics:
```
float16x8_t vmmlaq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, fpm_t);
float32x4_t vmmlaq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t);
```
Fix CR comments; don't create a new intrinsic, and split test files
Split testcase files
Add extra lowering
Make it work properly
Lukacma
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Remove blank space
0d8078f to
a1bd2d7
Compare
…ics (llvm#165431)
Add support for the following new AArch64 Neon intrinsics:
```
float16x8_t vmmlaq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, fpm_t);
float32x4_t vmmlaq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t);
```

Add support for the following new AArch64 Neon intrinsics: