Skip to content

Commit 26395ce

Browse files
[AArch64] Add FP8 Neon intrinsics for dot-product
THis patch adds the following intrinsics: float16x4_t vdot_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpm) float16x8_t vdotq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) float16x4_t vdot_lane_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm) float16x4_t vdot_laneq_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm) float16x8_t vdotq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm) float16x8_t vdotq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
1 parent 997ee58 commit 26395ce

File tree

10 files changed

+424
-40
lines changed

10 files changed

+424
-40
lines changed

clang/include/clang/Basic/arm_neon.td

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2147,6 +2147,28 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
21472147
def VCVTN_F8_F16 : VInst<"vcvt_mf8_f16_fpm", ".(>F)(>F)V", "mQm">;
21482148
}
21492149

2150+
let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot2,neon" in {
2151+
def VDOT_F16_MF8 : VInst<"vdot_f16_mf8_fpm", "(>F)(>F)..V", "m">;
2152+
def VDOTQ_F16_MF8 : VInst<"vdotq_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
2153+
2154+
def VDOT_LANE_F16_MF8 : VInst<"vdot_lane_f16_mf8_fpm", "(>F)(>F)..IV", "m", [ImmCheck<3, ImmCheck0_3, 0>]>;
2155+
def VDOT_LANEQ_F16_MF8 : VInst<"vdot_laneq_f16_mf8_fpm", "(>F)(>F).QIV", "m", [ImmCheck<3, ImmCheck0_7, 0>]>;
2156+
2157+
def VDOTQ_LANE_F16_MF8 : VInst<"vdotq_lane_f16_mf8_fpm", "(>F)(>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
2158+
def VDOTQ_LANEQ_F16_MF8 : VInst<"vdotq_laneq_f16_mf8_fpm", "(>F)(>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_7, 0>]>;
2159+
}
2160+
2161+
let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot4,neon" in {
2162+
def VDOT_F32_MF8 : VInst<"vdot_f32_mf8_fpm", "(>>F)(>>F)..V", "m">;
2163+
def VDOTQ_F32_MF8 : VInst<"vdotq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
2164+
2165+
def VDOT_LANE_F32_MF8 : VInst<"vdot_lane_f32_mf8_fpm", "(>>F)(>>F)..IV", "m", [ImmCheck<3, ImmCheck0_1, 0>]>;
2166+
def VDOT_LANEQ_F32_MF8 : VInst<"vdot_laneq_f32_mf8_fpm", "(>>F)(>>F).QIV", "m", [ImmCheck<3, ImmCheck0_3, 0>]>;
2167+
2168+
def VDOTQ_LANE_F32_MF8 : VInst<"vdotq_lane_f32_mf8_fpm", "(>>F)(>>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_1, 0>]>;
2169+
def VDOTQ_LANEQ_F32_MF8 : VInst<"vdotq_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
2170+
}
2171+
21502172
let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
21512173
def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
21522174
def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;

clang/include/clang/Basic/arm_neon_incl.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ class Inst <string n, string p, string t, Operation o, list<ImmCheck> ch = []>{
302302
class SInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
303303
class IInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
304304
class WInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
305-
class VInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
305+
class VInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
306306

307307
// The following instruction classes are implemented via operators
308308
// instead of builtins. As such these declarations are only used for

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6915,6 +6915,25 @@ Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
69156915
return EmitNeonCall(F, Ops, name);
69166916
}
69176917

6918+
llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
6919+
unsigned IID, bool ExtendLane, llvm::Type *RetTy,
6920+
SmallVectorImpl<llvm::Value *> &Ops, unsigned ICEArguments,
6921+
const CallExpr *E, const char *name) {
6922+
6923+
const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
6924+
RetTy->getPrimitiveSizeInBits();
6925+
llvm::Type *Tys[] = {llvm::FixedVectorType::get(RetTy, ElemCount),
6926+
Ops[1]->getType()};
6927+
if (ExtendLane) {
6928+
auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
6929+
Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
6930+
Builder.getInt64(0));
6931+
}
6932+
llvm::Value *FPM =
6933+
EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
6934+
return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
6935+
}
6936+
69186937
Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
69196938
bool neg) {
69206939
int SV = cast<ConstantInt>(V)->getSExtValue();
@@ -12892,6 +12911,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
1289212911

1289312912
unsigned Int;
1289412913
bool ExtractLow = false;
12914+
bool ExtendLane = false;
1289512915
switch (BuiltinID) {
1289612916
default: return nullptr;
1289712917
case NEON::BI__builtin_neon_vbsl_v:
@@ -14159,6 +14179,33 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
1415914179
return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn2,
1416014180
Ty, Ops[1]->getType(), false, Ops, E, "vfcvtn2");
1416114181
}
14182+
14183+
case NEON::BI__builtin_neon_vdot_f16_mf8_fpm:
14184+
case NEON::BI__builtin_neon_vdotq_f16_mf8_fpm:
14185+
return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2, false, HalfTy,
14186+
Ops, ICEArguments, E, "fdot2");
14187+
case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm:
14188+
case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm:
14189+
ExtendLane = true;
14190+
LLVM_FALLTHROUGH;
14191+
case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm:
14192+
case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm:
14193+
return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2_lane,
14194+
ExtendLane, HalfTy, Ops, ICEArguments, E,
14195+
"fdot2_lane");
14196+
case NEON::BI__builtin_neon_vdot_f32_mf8_fpm:
14197+
case NEON::BI__builtin_neon_vdotq_f32_mf8_fpm:
14198+
return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4, false,
14199+
FloatTy, Ops, ICEArguments, E, "fdot4");
14200+
case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm:
14201+
case NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm:
14202+
ExtendLane = true;
14203+
LLVM_FALLTHROUGH;
14204+
case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm:
14205+
case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm:
14206+
return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
14207+
ExtendLane, FloatTy, Ops, ICEArguments, E,
14208+
"fdot4_lane");
1416214209
case NEON::BI__builtin_neon_vamin_f16:
1416314210
case NEON::BI__builtin_neon_vaminq_f16:
1416414211
case NEON::BI__builtin_neon_vamin_f32:

clang/lib/CodeGen/CodeGenFunction.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4670,6 +4670,11 @@ class CodeGenFunction : public CodeGenTypeCache {
46704670
llvm::Type *Ty1, bool Extract,
46714671
SmallVectorImpl<llvm::Value *> &Ops,
46724672
const CallExpr *E, const char *name);
4673+
llvm::Value *EmitFP8NeonFDOTCall(unsigned IID, bool ExtendLane,
4674+
llvm::Type *RetTy,
4675+
SmallVectorImpl<llvm::Value *> &Ops,
4676+
unsigned ICEArguments, const CallExpr *E,
4677+
const char *name);
46734678
llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
46744679
const llvm::ElementCount &Count);
46754680
llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
3+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
4+
5+
// REQUIES: aarch64-registered-target
6+
7+
#include <arm_neon.h>
8+
9+
// CHECK-LABEL: define dso_local <4 x half> @test_vdot_f16(
10+
// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
11+
// CHECK-NEXT: [[ENTRY:.*:]]
12+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
13+
// CHECK-NEXT: [[FDOT21_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
14+
// CHECK-NEXT: ret <4 x half> [[FDOT21_I]]
15+
//
16+
float16x4_t test_vdot_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
17+
return vdot_f16_mf8_fpm(vd, vn, vm, fpmr);
18+
}
19+
20+
// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_f16(
21+
// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
22+
// CHECK-NEXT: [[ENTRY:.*:]]
23+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
24+
// CHECK-NEXT: [[FDOT21_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
25+
// CHECK-NEXT: ret <8 x half> [[FDOT21_I]]
26+
//
27+
float16x8_t test_vdotq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
28+
return vdotq_f16_mf8_fpm(vd, vn, vm, fpmr);
29+
}
30+
31+
// CHECK-LABEL: define dso_local <4 x half> @test_vdot_lane_f16(
32+
// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
33+
// CHECK-NEXT: [[ENTRY:.*:]]
34+
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
35+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
36+
// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3)
37+
// CHECK-NEXT: ret <4 x half> [[FDOT2_LANE1]]
38+
//
39+
float16x4_t test_vdot_lane_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
40+
return vdot_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr);
41+
}
42+
43+
// CHECK-LABEL: define dso_local <4 x half> @test_vdot_laneq_f16(
44+
// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
45+
// CHECK-NEXT: [[ENTRY:.*:]]
46+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
47+
// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
48+
// CHECK-NEXT: ret <4 x half> [[FDOT2_LANE1]]
49+
//
50+
float16x4_t test_vdot_laneq_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) {
51+
return vdot_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr);
52+
}
53+
54+
// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_lane_f16(
55+
// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
56+
// CHECK-NEXT: [[ENTRY:.*:]]
57+
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
58+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
59+
// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3)
60+
// CHECK-NEXT: ret <8 x half> [[FDOT2_LANE1]]
61+
//
62+
float16x8_t test_vdotq_lane_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) {
63+
return vdotq_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr);
64+
}
65+
66+
// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_laneq_f16(
67+
// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
68+
// CHECK-NEXT: [[ENTRY:.*:]]
69+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
70+
// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
71+
// CHECK-NEXT: ret <8 x half> [[FDOT2_LANE1]]
72+
//
73+
float16x8_t test_vdotq_laneq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
74+
return vdotq_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr);
75+
}
76+
77+
// CHECK-LABEL: define dso_local <2 x float> @test_vdot_f32(
78+
// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
79+
// CHECK-NEXT: [[ENTRY:.*:]]
80+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
81+
// CHECK-NEXT: [[FDOT4_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
82+
// CHECK-NEXT: ret <2 x float> [[FDOT4_I]]
83+
//
84+
float32x2_t test_vdot_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
85+
return vdot_f32_mf8_fpm(vd, vn, vm, fpmr);
86+
}
87+
88+
// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_f32(
89+
// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
90+
// CHECK-NEXT: [[ENTRY:.*:]]
91+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
92+
// CHECK-NEXT: [[FDOT4_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
93+
// CHECK-NEXT: ret <4 x float> [[FDOT4_I]]
94+
//
95+
float32x4_t test_vdotq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
96+
return vdotq_f32_mf8_fpm(vd, vn, vm, fpmr);
97+
}
98+
99+
// CHECK-LABEL: define dso_local <2 x float> @test_vdot_lane_f32(
100+
// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
101+
// CHECK-NEXT: [[ENTRY:.*:]]
102+
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
103+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
104+
// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
105+
// CHECK-NEXT: ret <2 x float> [[FDOT4_LANE]]
106+
//
107+
float32x2_t test_vdot_lane_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
108+
return vdot_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr);
109+
}
110+
111+
// CHECK-LABEL: define dso_local <2 x float> @test_vdot_laneq_f32(
112+
// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
113+
// CHECK-NEXT: [[ENTRY:.*:]]
114+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
115+
// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
116+
// CHECK-NEXT: ret <2 x float> [[FDOT4_LANE]]
117+
//
118+
float32x2_t test_vdot_laneq_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) {
119+
return vdot_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr);
120+
}
121+
122+
// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_lane_f32(
123+
// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
124+
// CHECK-NEXT: [[ENTRY:.*:]]
125+
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
126+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
127+
// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
128+
// CHECK-NEXT: ret <4 x float> [[FDOT4_LANE]]
129+
//
130+
float32x4_t test_vdotq_lane_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) {
131+
return vdotq_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr);
132+
}
133+
134+
// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_laneq_f32(
135+
// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
136+
// CHECK-NEXT: [[ENTRY:.*:]]
137+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
138+
// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
139+
// CHECK-NEXT: ret <4 x float> [[FDOT4_LANE]]
140+
//
141+
float32x4_t test_vdotq_laneq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
142+
return vdotq_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr);
143+
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +faminmax -target-feature +fp8 -emit-llvm -verify %s -o /dev/null
2+
3+
// REQUIRES: aarch64-registered-target
4+
5+
#include <arm_neon.h>
6+
7+
void test_features(float16x4_t vd4, float16x8_t vd8, float32x4_t va4, float32x2_t va2,
8+
mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpm) {
9+
(void) vdot_f16_mf8_fpm(vd4, v8, v8, fpm);
10+
// expected-error@-1 {{'vdot_f16_mf8_fpm' requires target feature 'fp8dot2'}}
11+
(void) vdotq_f16_mf8_fpm(vd8, v16, v16, fpm);
12+
// expected-error@-1 {{'vdotq_f16_mf8_fpm' requires target feature 'fp8dot2'}}
13+
(void) vdot_lane_f16_mf8_fpm(vd4, v8, v8, 3, fpm);
14+
// expected-error@-1 {{'__builtin_neon_vdot_lane_f16_mf8_fpm' needs target feature fp8dot2,neon}}
15+
(void) vdot_laneq_f16_mf8_fpm(vd4, v8, v16, 7, fpm);
16+
// expected-error@-1 {{'__builtin_neon_vdot_laneq_f16_mf8_fpm' needs target feature fp8dot2,neon}}
17+
(void) vdotq_lane_f16_mf8_fpm(vd8, v16, v8, 3, fpm);
18+
// expected-error@-1 {{'__builtin_neon_vdotq_lane_f16_mf8_fpm' needs target feature fp8dot2,neon}}
19+
(void) vdotq_laneq_f16_mf8_fpm(vd8, v16, v16, 7, fpm);
20+
// expected-error@-1 {{'__builtin_neon_vdotq_laneq_f16_mf8_fpm' needs target feature fp8dot2,neon}}
21+
22+
(void) vdot_f32_mf8_fpm(va2, v8, v8, fpm);
23+
// expected-error@-1 {{'vdot_f32_mf8_fpm' requires target feature 'fp8dot4'}}
24+
(void) vdotq_f32_mf8_fpm(va4, v16, v16, fpm);
25+
// expected-error@-1 {{'vdotq_f32_mf8_fpm' requires target feature 'fp8dot4}}
26+
(void) vdot_lane_f32_mf8_fpm(va2, v8, v8, 1, fpm);
27+
// expected-error@-1 {{'__builtin_neon_vdot_lane_f32_mf8_fpm' needs target feature fp8dot4,neon}}
28+
(void) vdot_laneq_f32_mf8_fpm(va2, v8, v16, 3, fpm);
29+
// expected-error@-1 {{'__builtin_neon_vdot_laneq_f32_mf8_fpm' needs target feature fp8dot4,neon}}
30+
(void) vdotq_lane_f32_mf8_fpm(va4, v16, v8, 1, fpm);
31+
// expected-error@-1 {{'__builtin_neon_vdotq_lane_f32_mf8_fpm' needs target feature fp8dot4,neon}}
32+
(void) vdotq_laneq_f32_mf8_fpm(va4, v16, v16, 3, fpm);
33+
// expected-error@-1 {{'__builtin_neon_vdotq_laneq_f32_mf8_fpm' needs target feature fp8dot4,neon}}
34+
}
35+
36+
void test_imm(float16x4_t vd4, float16x8_t vd8, float32x2_t va2, float32x4_t va4,
37+
mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpm) {
38+
(void) vdot_lane_f16_mf8_fpm(vd4, v8, v8, -1, fpm);
39+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
40+
(void) vdot_laneq_f16_mf8_fpm(vd4, v8, v16, -1, fpm);
41+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
42+
(void) vdotq_lane_f16_mf8_fpm(vd8, v16, v8, -1, fpm);
43+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
44+
(void) vdotq_laneq_f16_mf8_fpm(vd8, v16, v16, -1, fpm);
45+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
46+
(void) vdot_lane_f32_mf8_fpm(va2, v8, v8, -1, fpm);
47+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
48+
(void) vdot_laneq_f32_mf8_fpm(va2, v8, v16, -1, fpm);
49+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
50+
(void) vdotq_lane_f32_mf8_fpm(va4, v16, v8, -1, fpm);
51+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
52+
(void) vdotq_laneq_f32_mf8_fpm(va4, v16, v16, -1, fpm);
53+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
54+
}

0 commit comments

Comments
 (0)