Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions clang/include/clang/Basic/arm_neon.td
Original file line number Diff line number Diff line change
Expand Up @@ -1896,6 +1896,14 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>;
}

let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f16mm,neon" in {
def VMMLA_F16_MF8 : VInst<"vmmla_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
}

let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f32mm,neon" in {
def VMMLA_F32_MF8 : VInst<"vmmla_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
}

let TargetGuard = "i8mm,neon" in {
def VMMLA : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
Expand Down
10 changes: 10 additions & 0 deletions clang/lib/CodeGen/TargetBuiltins/ARM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7624,6 +7624,16 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
}
case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
{llvm::FixedVectorType::get(HalfTy, 8),
llvm::FixedVectorType::get(Int8Ty, 16)},
Ops, E, "fmmla");
case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
{llvm::FixedVectorType::get(FloatTy, 4),
llvm::FixedVectorType::get(Int8Ty, 16)},
Ops, E, "fmmla");
case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
ExtractLow = true;
[[fallthrough]];
Expand Down
23 changes: 23 additions & 0 deletions clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f16mm -target-feature +fp8 \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg,sroa \
// RUN: | FileCheck %s

// REQUIRES: aarch64-registered-target

#include <arm_neon.h>

// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
// CHECK-NEXT: [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
// CHECK-NEXT: ret <8 x half> [[FMMLA1_I]]
//
float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
}
21 changes: 21 additions & 0 deletions clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f32mm -target-feature +fp8 \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg,sroa \
// RUN: | FileCheck %s

// REQUIRES: aarch64-registered-target

#include <arm_neon.h>

// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
// CHECK-NEXT: [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
// CHECK-NEXT: ret <4 x float> [[FMMLA_I]]
//
float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
}

1 change: 1 addition & 0 deletions llvm/include/llvm/IR/IntrinsicsAArch64.td
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,7 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_fmmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_bfmmla
Expand Down
16 changes: 11 additions & 5 deletions llvm/lib/Target/AArch64/AArch64InstrFormats.td
Original file line number Diff line number Diff line change
Expand Up @@ -13292,18 +13292,24 @@ multiclass AtomicFPStore<bit R, bits<3> op0, string asm> {
def H : BaseAtomicFPStore<FPR16, 0b01, R, op0, asm>;
}

class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind>
class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind, list<dag> pattern>
: BaseSIMDThreeSameVectorTied<1, 1, {size, 0}, 0b11101,
V128, asm, ".16b", []> {
V128, asm, ".16b", pattern> {
let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn.16b, $Rm.16b",
"|", kind, "\t$Rd, $Rn, $Rm}");
}

multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{
def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h">{
multiclass SIMDThreeSameVectorFP8MatrixMul<string asm, SDPatternOperator OpNode>{
def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h",
[(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd),
(v16i8 V128:$Rn),
(v16i8 V128:$Rm)))]> {
let Predicates = [HasNEON, HasF8F16MM];
}
def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s">{
def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s",
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
(v16i8 V128:$Rn),
(v16i8 V128:$Rm)))]> {
let Predicates = [HasNEON, HasF8F32MM];
}
}
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -11417,7 +11417,7 @@ let Predicates = [HasF16F32MM] in
defm FMMLA : SIMDThreeSameVectorFMLAWiden<"fmmla">;

let Uses = [FPMR, FPCR] in
defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">;
defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla", int_aarch64_neon_fmmla>;

//===----------------------------------------------------------------------===//
// Contention Management Hints (FEAT_CMH)
Expand Down
14 changes: 14 additions & 0 deletions llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm -global-isel < %s | FileCheck %s

define <8 x half> @fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: fmmla.v8f16.v16i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmmla v0.8h, v1.16b, v2.16b
; CHECK-NEXT: ret
entry:
%vfmmla1.i = tail call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) #3
ret <8 x half> %vfmmla1.i
}

13 changes: 13 additions & 0 deletions llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm -global-isel < %s | FileCheck %s

define <4 x float> @fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: fmmla.v4f32.v16i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmmla v0.4s, v1.16b, v2.16b
; CHECK-NEXT: ret
entry:
%vfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) #3
ret <4 x float> %vfmmla1.i
}
Loading