Skip to content

Commit c9b7e94

Browse files
[AArch64] Add FP8 Neon intrinsics for dot-product
THis patch adds the following intrinsics: float16x4_t vdot_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpm) float16x8_t vdotq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) float16x4_t vdot_lane_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm) float16x4_t vdot_laneq_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm) float16x8_t vdotq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm) float16x8_t vdotq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
1 parent 1836ab5 commit c9b7e94

File tree

10 files changed

+424
-40
lines changed

10 files changed

+424
-40
lines changed

clang/include/clang/Basic/arm_neon.td

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2148,6 +2148,28 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
21482148
def VCVTNQ_F8_F16 : VInst<"vcvtq_mf8_f16_fpm", ".(>F)(>F)V", "Qm">;
21492149
}
21502150

2151+
let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot2,neon" in {
2152+
def VDOT_F16_MF8 : VInst<"vdot_f16_mf8_fpm", "(>F)(>F)..V", "m">;
2153+
def VDOTQ_F16_MF8 : VInst<"vdotq_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
2154+
2155+
def VDOT_LANE_F16_MF8 : VInst<"vdot_lane_f16_mf8_fpm", "(>F)(>F)..IV", "m", [ImmCheck<3, ImmCheck0_3, 0>]>;
2156+
def VDOT_LANEQ_F16_MF8 : VInst<"vdot_laneq_f16_mf8_fpm", "(>F)(>F).QIV", "m", [ImmCheck<3, ImmCheck0_7, 0>]>;
2157+
2158+
def VDOTQ_LANE_F16_MF8 : VInst<"vdotq_lane_f16_mf8_fpm", "(>F)(>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
2159+
def VDOTQ_LANEQ_F16_MF8 : VInst<"vdotq_laneq_f16_mf8_fpm", "(>F)(>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_7, 0>]>;
2160+
}
2161+
2162+
let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot4,neon" in {
2163+
def VDOT_F32_MF8 : VInst<"vdot_f32_mf8_fpm", "(>>F)(>>F)..V", "m">;
2164+
def VDOTQ_F32_MF8 : VInst<"vdotq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
2165+
2166+
def VDOT_LANE_F32_MF8 : VInst<"vdot_lane_f32_mf8_fpm", "(>>F)(>>F)..IV", "m", [ImmCheck<3, ImmCheck0_1, 0>]>;
2167+
def VDOT_LANEQ_F32_MF8 : VInst<"vdot_laneq_f32_mf8_fpm", "(>>F)(>>F).QIV", "m", [ImmCheck<3, ImmCheck0_3, 0>]>;
2168+
2169+
def VDOTQ_LANE_F32_MF8 : VInst<"vdotq_lane_f32_mf8_fpm", "(>>F)(>>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_1, 0>]>;
2170+
def VDOTQ_LANEQ_F32_MF8 : VInst<"vdotq_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
2171+
}
2172+
21512173
let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
21522174
def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
21532175
def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;

clang/include/clang/Basic/arm_neon_incl.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ class Inst <string n, string p, string t, Operation o, list<ImmCheck> ch = []>{
302302
class SInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
303303
class IInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
304304
class WInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
305-
class VInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
305+
class VInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
306306

307307
// The following instruction classes are implemented via operators
308308
// instead of builtins. As such these declarations are only used for

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6875,6 +6875,25 @@ Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
68756875
return EmitNeonCall(F, Ops, name);
68766876
}
68776877

6878+
llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
6879+
unsigned IID, bool ExtendLane, llvm::Type *RetTy,
6880+
SmallVectorImpl<llvm::Value *> &Ops, unsigned ICEArguments,
6881+
const CallExpr *E, const char *name) {
6882+
6883+
const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
6884+
RetTy->getPrimitiveSizeInBits();
6885+
llvm::Type *Tys[] = {llvm::FixedVectorType::get(RetTy, ElemCount),
6886+
Ops[1]->getType()};
6887+
if (ExtendLane) {
6888+
auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
6889+
Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
6890+
Builder.getInt64(0));
6891+
}
6892+
llvm::Value *FPM =
6893+
EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
6894+
return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
6895+
}
6896+
68786897
Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
68796898
bool neg) {
68806899
int SV = cast<ConstantInt>(V)->getSExtValue();
@@ -12822,6 +12841,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
1282212841

1282312842
unsigned Int;
1282412843
bool ExtractLow = false;
12844+
bool ExtendLane = false;
1282512845
switch (BuiltinID) {
1282612846
default: return nullptr;
1282712847
case NEON::BI__builtin_neon_vbsl_v:
@@ -14089,6 +14109,33 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
1408914109
return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn2,
1409014110
Ty, Ops[1]->getType(), false, Ops, E, "vfcvtn2");
1409114111
}
14112+
14113+
case NEON::BI__builtin_neon_vdot_f16_mf8_fpm:
14114+
case NEON::BI__builtin_neon_vdotq_f16_mf8_fpm:
14115+
return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2, false, HalfTy,
14116+
Ops, ICEArguments, E, "fdot2");
14117+
case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm:
14118+
case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm:
14119+
ExtendLane = true;
14120+
LLVM_FALLTHROUGH;
14121+
case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm:
14122+
case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm:
14123+
return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2_lane,
14124+
ExtendLane, HalfTy, Ops, ICEArguments, E,
14125+
"fdot2_lane");
14126+
case NEON::BI__builtin_neon_vdot_f32_mf8_fpm:
14127+
case NEON::BI__builtin_neon_vdotq_f32_mf8_fpm:
14128+
return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4, false,
14129+
FloatTy, Ops, ICEArguments, E, "fdot4");
14130+
case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm:
14131+
case NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm:
14132+
ExtendLane = true;
14133+
LLVM_FALLTHROUGH;
14134+
case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm:
14135+
case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm:
14136+
return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
14137+
ExtendLane, FloatTy, Ops, ICEArguments, E,
14138+
"fdot4_lane");
1409214139
case NEON::BI__builtin_neon_vamin_f16:
1409314140
case NEON::BI__builtin_neon_vaminq_f16:
1409414141
case NEON::BI__builtin_neon_vamin_f32:

clang/lib/CodeGen/CodeGenFunction.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4655,6 +4655,11 @@ class CodeGenFunction : public CodeGenTypeCache {
46554655
llvm::Type *Ty1, bool Extract,
46564656
SmallVectorImpl<llvm::Value *> &Ops,
46574657
const CallExpr *E, const char *name);
4658+
llvm::Value *EmitFP8NeonFDOTCall(unsigned IID, bool ExtendLane,
4659+
llvm::Type *RetTy,
4660+
SmallVectorImpl<llvm::Value *> &Ops,
4661+
unsigned ICEArguments, const CallExpr *E,
4662+
const char *name);
46584663
llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
46594664
const llvm::ElementCount &Count);
46604665
llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
3+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
4+
5+
// REQUIES: aarch64-registered-target
6+
7+
#include <arm_neon.h>
8+
9+
// CHECK-LABEL: define dso_local <4 x half> @test_vdot_f16(
10+
// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
11+
// CHECK-NEXT: [[ENTRY:.*:]]
12+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
13+
// CHECK-NEXT: [[FDOT21_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
14+
// CHECK-NEXT: ret <4 x half> [[FDOT21_I]]
15+
//
16+
float16x4_t test_vdot_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
17+
return vdot_f16_mf8_fpm(vd, vn, vm, fpmr);
18+
}
19+
20+
// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_f16(
21+
// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
22+
// CHECK-NEXT: [[ENTRY:.*:]]
23+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
24+
// CHECK-NEXT: [[FDOT21_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
25+
// CHECK-NEXT: ret <8 x half> [[FDOT21_I]]
26+
//
27+
float16x8_t test_vdotq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
28+
return vdotq_f16_mf8_fpm(vd, vn, vm, fpmr);
29+
}
30+
31+
// CHECK-LABEL: define dso_local <4 x half> @test_vdot_lane_f16(
32+
// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
33+
// CHECK-NEXT: [[ENTRY:.*:]]
34+
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
35+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
36+
// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3)
37+
// CHECK-NEXT: ret <4 x half> [[FDOT2_LANE1]]
38+
//
39+
float16x4_t test_vdot_lane_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
40+
return vdot_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr);
41+
}
42+
43+
// CHECK-LABEL: define dso_local <4 x half> @test_vdot_laneq_f16(
44+
// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
45+
// CHECK-NEXT: [[ENTRY:.*:]]
46+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
47+
// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
48+
// CHECK-NEXT: ret <4 x half> [[FDOT2_LANE1]]
49+
//
50+
float16x4_t test_vdot_laneq_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) {
51+
return vdot_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr);
52+
}
53+
54+
// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_lane_f16(
55+
// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
56+
// CHECK-NEXT: [[ENTRY:.*:]]
57+
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
58+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
59+
// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3)
60+
// CHECK-NEXT: ret <8 x half> [[FDOT2_LANE1]]
61+
//
62+
float16x8_t test_vdotq_lane_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) {
63+
return vdotq_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr);
64+
}
65+
66+
// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_laneq_f16(
67+
// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
68+
// CHECK-NEXT: [[ENTRY:.*:]]
69+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
70+
// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
71+
// CHECK-NEXT: ret <8 x half> [[FDOT2_LANE1]]
72+
//
73+
float16x8_t test_vdotq_laneq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
74+
return vdotq_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr);
75+
}
76+
77+
// CHECK-LABEL: define dso_local <2 x float> @test_vdot_f32(
78+
// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
79+
// CHECK-NEXT: [[ENTRY:.*:]]
80+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
81+
// CHECK-NEXT: [[FDOT4_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
82+
// CHECK-NEXT: ret <2 x float> [[FDOT4_I]]
83+
//
84+
float32x2_t test_vdot_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
85+
return vdot_f32_mf8_fpm(vd, vn, vm, fpmr);
86+
}
87+
88+
// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_f32(
89+
// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
90+
// CHECK-NEXT: [[ENTRY:.*:]]
91+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
92+
// CHECK-NEXT: [[FDOT4_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
93+
// CHECK-NEXT: ret <4 x float> [[FDOT4_I]]
94+
//
95+
float32x4_t test_vdotq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
96+
return vdotq_f32_mf8_fpm(vd, vn, vm, fpmr);
97+
}
98+
99+
// CHECK-LABEL: define dso_local <2 x float> @test_vdot_lane_f32(
100+
// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
101+
// CHECK-NEXT: [[ENTRY:.*:]]
102+
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
103+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
104+
// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
105+
// CHECK-NEXT: ret <2 x float> [[FDOT4_LANE]]
106+
//
107+
float32x2_t test_vdot_lane_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
108+
return vdot_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr);
109+
}
110+
111+
// CHECK-LABEL: define dso_local <2 x float> @test_vdot_laneq_f32(
112+
// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
113+
// CHECK-NEXT: [[ENTRY:.*:]]
114+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
115+
// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
116+
// CHECK-NEXT: ret <2 x float> [[FDOT4_LANE]]
117+
//
118+
float32x2_t test_vdot_laneq_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) {
119+
return vdot_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr);
120+
}
121+
122+
// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_lane_f32(
123+
// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
124+
// CHECK-NEXT: [[ENTRY:.*:]]
125+
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
126+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
127+
// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
128+
// CHECK-NEXT: ret <4 x float> [[FDOT4_LANE]]
129+
//
130+
float32x4_t test_vdotq_lane_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) {
131+
return vdotq_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr);
132+
}
133+
134+
// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_laneq_f32(
135+
// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
136+
// CHECK-NEXT: [[ENTRY:.*:]]
137+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
138+
// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
139+
// CHECK-NEXT: ret <4 x float> [[FDOT4_LANE]]
140+
//
141+
float32x4_t test_vdotq_laneq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
142+
return vdotq_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr);
143+
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +faminmax -target-feature +fp8 -emit-llvm -verify %s -o /dev/null
2+
3+
// REQUIRES: aarch64-registered-target
4+
5+
#include <arm_neon.h>
6+
7+
void test_features(float16x4_t vd4, float16x8_t vd8, float32x4_t va4, float32x2_t va2,
8+
mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpm) {
9+
(void) vdot_f16_mf8_fpm(vd4, v8, v8, fpm);
10+
// expected-error@-1 {{'vdot_f16_mf8_fpm' requires target feature 'fp8dot2'}}
11+
(void) vdotq_f16_mf8_fpm(vd8, v16, v16, fpm);
12+
// expected-error@-1 {{'vdotq_f16_mf8_fpm' requires target feature 'fp8dot2'}}
13+
(void) vdot_lane_f16_mf8_fpm(vd4, v8, v8, 3, fpm);
14+
// expected-error@-1 {{'__builtin_neon_vdot_lane_f16_mf8_fpm' needs target feature fp8dot2,neon}}
15+
(void) vdot_laneq_f16_mf8_fpm(vd4, v8, v16, 7, fpm);
16+
// expected-error@-1 {{'__builtin_neon_vdot_laneq_f16_mf8_fpm' needs target feature fp8dot2,neon}}
17+
(void) vdotq_lane_f16_mf8_fpm(vd8, v16, v8, 3, fpm);
18+
// expected-error@-1 {{'__builtin_neon_vdotq_lane_f16_mf8_fpm' needs target feature fp8dot2,neon}}
19+
(void) vdotq_laneq_f16_mf8_fpm(vd8, v16, v16, 7, fpm);
20+
// expected-error@-1 {{'__builtin_neon_vdotq_laneq_f16_mf8_fpm' needs target feature fp8dot2,neon}}
21+
22+
(void) vdot_f32_mf8_fpm(va2, v8, v8, fpm);
23+
// expected-error@-1 {{'vdot_f32_mf8_fpm' requires target feature 'fp8dot4'}}
24+
(void) vdotq_f32_mf8_fpm(va4, v16, v16, fpm);
25+
// expected-error@-1 {{'vdotq_f32_mf8_fpm' requires target feature 'fp8dot4}}
26+
(void) vdot_lane_f32_mf8_fpm(va2, v8, v8, 1, fpm);
27+
// expected-error@-1 {{'__builtin_neon_vdot_lane_f32_mf8_fpm' needs target feature fp8dot4,neon}}
28+
(void) vdot_laneq_f32_mf8_fpm(va2, v8, v16, 3, fpm);
29+
// expected-error@-1 {{'__builtin_neon_vdot_laneq_f32_mf8_fpm' needs target feature fp8dot4,neon}}
30+
(void) vdotq_lane_f32_mf8_fpm(va4, v16, v8, 1, fpm);
31+
// expected-error@-1 {{'__builtin_neon_vdotq_lane_f32_mf8_fpm' needs target feature fp8dot4,neon}}
32+
(void) vdotq_laneq_f32_mf8_fpm(va4, v16, v16, 3, fpm);
33+
// expected-error@-1 {{'__builtin_neon_vdotq_laneq_f32_mf8_fpm' needs target feature fp8dot4,neon}}
34+
}
35+
36+
void test_imm(float16x4_t vd4, float16x8_t vd8, float32x2_t va2, float32x4_t va4,
37+
mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpm) {
38+
(void) vdot_lane_f16_mf8_fpm(vd4, v8, v8, -1, fpm);
39+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
40+
(void) vdot_laneq_f16_mf8_fpm(vd4, v8, v16, -1, fpm);
41+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
42+
(void) vdotq_lane_f16_mf8_fpm(vd8, v16, v8, -1, fpm);
43+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
44+
(void) vdotq_laneq_f16_mf8_fpm(vd8, v16, v16, -1, fpm);
45+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
46+
(void) vdot_lane_f32_mf8_fpm(va2, v8, v8, -1, fpm);
47+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
48+
(void) vdot_laneq_f32_mf8_fpm(va2, v8, v16, -1, fpm);
49+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
50+
(void) vdotq_lane_f32_mf8_fpm(va4, v16, v8, -1, fpm);
51+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
52+
(void) vdotq_laneq_f32_mf8_fpm(va4, v16, v16, -1, fpm);
53+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
54+
}

0 commit comments

Comments
 (0)