Skip to content

Commit 7377ac0

Browse files
authored
[AArch64][llvm] Add support for Neon vmmlaq_{f16,f32}_mf8_fpm intrinsics (#165431)
Add support for the following new AArch64 Neon intrinsics: ``` float16x8_t vmmlaq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, fpm_t); float32x4_t vmmlaq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t); ```
1 parent bcb1b77 commit 7377ac0

File tree

9 files changed

+102
-6
lines changed

9 files changed

+102
-6
lines changed

clang/include/clang/Basic/arm_neon.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1921,6 +1921,14 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
19211921
def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>;
19221922
}
19231923

1924+
let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f16mm,neon" in {
1925+
def VMMLA_F16_MF8 : VInst<"vmmla_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
1926+
}
1927+
1928+
let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f32mm,neon" in {
1929+
def VMMLA_F32_MF8 : VInst<"vmmla_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
1930+
}
1931+
19241932
let TargetGuard = "i8mm,neon" in {
19251933
def VMMLA : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
19261934
def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;

clang/lib/CodeGen/TargetBuiltins/ARM.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7646,6 +7646,16 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
76467646
Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
76477647
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
76487648
}
7649+
case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
7650+
return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
7651+
{llvm::FixedVectorType::get(HalfTy, 8),
7652+
llvm::FixedVectorType::get(Int8Ty, 16)},
7653+
Ops, E, "fmmla");
7654+
case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
7655+
return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
7656+
{llvm::FixedVectorType::get(FloatTy, 4),
7657+
llvm::FixedVectorType::get(Int8Ty, 16)},
7658+
Ops, E, "fmmla");
76497659
case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
76507660
ExtractLow = true;
76517661
[[fallthrough]];
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f16mm -target-feature +fp8 \
3+
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
4+
// RUN: | opt -S -passes=mem2reg,sroa \
5+
// RUN: | FileCheck %s
6+
7+
// REQUIRES: aarch64-registered-target
8+
9+
#include <arm_neon.h>
10+
11+
// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
12+
// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
13+
// CHECK-NEXT: [[ENTRY:.*:]]
14+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
15+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
16+
// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
17+
// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
18+
// CHECK-NEXT: [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
19+
// CHECK-NEXT: ret <8 x half> [[FMMLA1_I]]
20+
//
21+
float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
22+
return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
23+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f32mm -target-feature +fp8 \
3+
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
4+
// RUN: | opt -S -passes=mem2reg,sroa \
5+
// RUN: | FileCheck %s
6+
7+
// REQUIRES: aarch64-registered-target
8+
9+
#include <arm_neon.h>
10+
11+
// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
12+
// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
13+
// CHECK-NEXT: [[ENTRY:.*:]]
14+
// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
15+
// CHECK-NEXT: [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
16+
// CHECK-NEXT: ret <4 x float> [[FMMLA_I]]
17+
//
18+
float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
19+
return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
20+
}
21+

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,7 @@ let TargetPrefix = "aarch64" in {
499499
def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
500500
def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
501501
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
502+
def int_aarch64_neon_fmmla : AdvSIMD_MatMul_Intrinsic;
502503
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
503504
def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
504505
def int_aarch64_neon_bfmmla

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13292,18 +13292,24 @@ multiclass AtomicFPStore<bit R, bits<3> op0, string asm> {
1329213292
def H : BaseAtomicFPStore<FPR16, 0b01, R, op0, asm>;
1329313293
}
1329413294

13295-
class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind>
13295+
class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind, list<dag> pattern>
1329613296
: BaseSIMDThreeSameVectorTied<1, 1, {size, 0}, 0b11101,
13297-
V128, asm, ".16b", []> {
13297+
V128, asm, ".16b", pattern> {
1329813298
let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn.16b, $Rm.16b",
1329913299
"|", kind, "\t$Rd, $Rn, $Rm}");
1330013300
}
1330113301

13302-
multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{
13303-
def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h">{
13302+
multiclass SIMDThreeSameVectorFP8MatrixMul<string asm, SDPatternOperator OpNode>{
13303+
def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h",
13304+
[(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd),
13305+
(v16i8 V128:$Rn),
13306+
(v16i8 V128:$Rm)))]> {
1330413307
let Predicates = [HasNEON, HasF8F16MM];
1330513308
}
13306-
def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s">{
13309+
def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s",
13310+
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
13311+
(v16i8 V128:$Rn),
13312+
(v16i8 V128:$Rm)))]> {
1330713313
let Predicates = [HasNEON, HasF8F32MM];
1330813314
}
1330913315
}

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11417,7 +11417,7 @@ let Predicates = [HasF16F32MM] in
1141711417
defm FMMLA : SIMDThreeSameVectorFMLAWiden<"fmmla">;
1141811418

1141911419
let Uses = [FPMR, FPCR] in
11420-
defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">;
11420+
defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla", int_aarch64_neon_fmmla>;
1142111421

1142211422
//===----------------------------------------------------------------------===//
1142311423
// Contention Management Hints (FEAT_CMH)
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm < %s | FileCheck %s
3+
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm -global-isel < %s | FileCheck %s
4+
5+
define <8 x half> @fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) {
6+
; CHECK-LABEL: fmmla.v8f16.v16i8:
7+
; CHECK: // %bb.0: // %entry
8+
; CHECK-NEXT: fmmla v0.8h, v1.16b, v2.16b
9+
; CHECK-NEXT: ret
10+
entry:
11+
%vfmmla1.i = tail call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) #3
12+
ret <8 x half> %vfmmla1.i
13+
}
14+
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm < %s | FileCheck %s
3+
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm -global-isel < %s | FileCheck %s
4+
5+
define <4 x float> @fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) {
6+
; CHECK-LABEL: fmmla.v4f32.v16i8:
7+
; CHECK: // %bb.0: // %entry
8+
; CHECK-NEXT: fmmla v0.4s, v1.16b, v2.16b
9+
; CHECK-NEXT: ret
10+
entry:
11+
%vfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) #3
12+
ret <4 x float> %vfmmla1.i
13+
}

0 commit comments

Comments
 (0)