Skip to content

Commit 3e4db7f

Browse files
[AArch64] Add FP8 Neon intrinsics for dot-product
THis patch adds the following intrinsics: float16x4_t vdot_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpm) float16x8_t vdotq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) float16x4_t vdot_lane_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm) float16x4_t vdot_laneq_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm) float16x8_t vdotq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm) float16x8_t vdotq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
1 parent e65fc02 commit 3e4db7f

File tree

10 files changed

+427
-40
lines changed

10 files changed

+427
-40
lines changed

clang/include/clang/Basic/arm_neon.td

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2148,6 +2148,28 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
21482148
def VCVTNQ_F8_F16 : VInst<"vcvtq_mf8_f16_fpm", ".(>F)(>F)V", "Qm">;
21492149
}
21502150

2151+
let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot2,neon" in {
2152+
def VDOT_F16_MF8 : VInst<"vdot_f16_mf8_fpm", "(>F)(>F)..V", "m">;
2153+
def VDOTQ_F16_MF8 : VInst<"vdotq_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
2154+
2155+
def VDOT_LANE_F16_MF8 : VInst<"vdot_lane_f16_mf8_fpm", "(>F)(>F)..IV", "m", [ImmCheck<3, ImmCheck0_3, 0>]>;
2156+
def VDOT_LANEQ_F16_MF8 : VInst<"vdot_laneq_f16_mf8_fpm", "(>F)(>F).QIV", "m", [ImmCheck<3, ImmCheck0_7, 0>]>;
2157+
2158+
def VDOTQ_LANE_F16_MF8 : VInst<"vdotq_lane_f16_mf8_fpm", "(>F)(>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
2159+
def VDOTQ_LANEQ_F16_MF8 : VInst<"vdotq_laneq_f16_mf8_fpm", "(>F)(>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_7, 0>]>;
2160+
}
2161+
2162+
let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot4,neon" in {
2163+
def VDOT_F32_MF8 : VInst<"vdot_f32_mf8_fpm", "(>>F)(>>F)..V", "m">;
2164+
def VDOTQ_F32_MF8 : VInst<"vdotq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
2165+
2166+
def VDOT_LANE_F32_MF8 : VInst<"vdot_lane_f32_mf8_fpm", "(>>F)(>>F)..IV", "m", [ImmCheck<3, ImmCheck0_1, 0>]>;
2167+
def VDOT_LANEQ_F32_MF8 : VInst<"vdot_laneq_f32_mf8_fpm", "(>>F)(>>F).QIV", "m", [ImmCheck<3, ImmCheck0_3, 0>]>;
2168+
2169+
def VDOTQ_LANE_F32_MF8 : VInst<"vdotq_lane_f32_mf8_fpm", "(>>F)(>>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_1, 0>]>;
2170+
def VDOTQ_LANEQ_F32_MF8 : VInst<"vdotq_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
2171+
}
2172+
21512173
let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
21522174
def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
21532175
def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;

clang/include/clang/Basic/arm_neon_incl.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ class Inst <string n, string p, string t, Operation o, list<ImmCheck> ch = []>{
302302
class SInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
303303
class IInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
304304
class WInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
305-
class VInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
305+
class VInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
306306

307307
// The following instruction classes are implemented via operators
308308
// instead of builtins. As such these declarations are only used for

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6875,6 +6875,25 @@ Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
68756875
return EmitNeonCall(F, Ops, name);
68766876
}
68776877

6878+
llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
6879+
unsigned IID, bool ExtendLane, llvm::Type *RetTy,
6880+
SmallVectorImpl<llvm::Value *> &Ops, unsigned ICEArguments,
6881+
const CallExpr *E, const char *name) {
6882+
6883+
const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
6884+
RetTy->getPrimitiveSizeInBits();
6885+
llvm::Type *Tys[] = {llvm::FixedVectorType::get(RetTy, ElemCount),
6886+
Ops[1]->getType()};
6887+
if (ExtendLane) {
6888+
auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
6889+
Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
6890+
Builder.getInt64(0));
6891+
}
6892+
llvm::Value *FPM =
6893+
EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
6894+
return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
6895+
}
6896+
68786897
Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
68796898
bool neg) {
68806899
int SV = cast<ConstantInt>(V)->getSExtValue();
@@ -12795,6 +12814,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
1279512814
return V;
1279612815

1279712816
unsigned Int;
12817+
bool ExtendLane = false;
1279812818
switch (BuiltinID) {
1279912819
default: return nullptr;
1280012820
case NEON::BI__builtin_neon_vbsl_v:
@@ -14121,6 +14141,33 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
1412114141
EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
1412214142
return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn2");
1412314143
}
14144+
14145+
case NEON::BI__builtin_neon_vdot_f16_mf8_fpm:
14146+
case NEON::BI__builtin_neon_vdotq_f16_mf8_fpm:
14147+
return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2, false, HalfTy,
14148+
Ops, ICEArguments, E, "fdot2");
14149+
case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm:
14150+
case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm:
14151+
ExtendLane = true;
14152+
LLVM_FALLTHROUGH;
14153+
case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm:
14154+
case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm:
14155+
return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2_lane,
14156+
ExtendLane, HalfTy, Ops, ICEArguments, E,
14157+
"fdot2_lane");
14158+
case NEON::BI__builtin_neon_vdot_f32_mf8_fpm:
14159+
case NEON::BI__builtin_neon_vdotq_f32_mf8_fpm:
14160+
return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4, false,
14161+
FloatTy, Ops, ICEArguments, E, "fdot4");
14162+
case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm:
14163+
case NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm:
14164+
ExtendLane = true;
14165+
LLVM_FALLTHROUGH;
14166+
case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm:
14167+
case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm:
14168+
return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
14169+
ExtendLane, FloatTy, Ops, ICEArguments, E,
14170+
"fdot4_lane");
1412414171
case NEON::BI__builtin_neon_vamin_f16:
1412514172
case NEON::BI__builtin_neon_vaminq_f16:
1412614173
case NEON::BI__builtin_neon_vamin_f32:

clang/lib/CodeGen/CodeGenFunction.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4627,6 +4627,11 @@ class CodeGenFunction : public CodeGenTypeCache {
46274627
llvm::Value *EmitFP8NeonCall(llvm::Function *F,
46284628
SmallVectorImpl<llvm::Value *> &O,
46294629
llvm::Value *FPM, const char *name);
4630+
llvm::Value *EmitFP8NeonFDOTCall(unsigned IID, bool ExtendLane,
4631+
llvm::Type *RetTy,
4632+
SmallVectorImpl<llvm::Value *> &Ops,
4633+
unsigned ICEArguments, const CallExpr *E,
4634+
const char *name);
46304635
llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
46314636
const llvm::ElementCount &Count);
46324637
llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
3+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
4+
5+
// REQUIES: aarch64-registered-target
6+
7+
#include <arm_neon.h>
8+
9+
// CHECK-LABEL: define dso_local <4 x half> @test_vdot_f16(
10+
// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
11+
// CHECK-NEXT: [[ENTRY:.*:]]
12+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
13+
// CHECK-NEXT: [[FDOT21_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
14+
// CHECK-NEXT: ret <4 x half> [[FDOT21_I]]
15+
//
16+
float16x4_t test_vdot_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
17+
return vdot_f16_mf8_fpm(vd, vn, vm, fpmr);
18+
}
19+
20+
// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_f16(
21+
// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
22+
// CHECK-NEXT: [[ENTRY:.*:]]
23+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
24+
// CHECK-NEXT: [[FDOT21_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
25+
// CHECK-NEXT: ret <8 x half> [[FDOT21_I]]
26+
//
27+
float16x8_t test_vdotq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
28+
return vdotq_f16_mf8_fpm(vd, vn, vm, fpmr);
29+
}
30+
31+
// CHECK-LABEL: define dso_local <4 x half> @test_vdot_lane_f16(
32+
// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
33+
// CHECK-NEXT: [[ENTRY:.*:]]
34+
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
35+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
36+
// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3)
37+
// CHECK-NEXT: ret <4 x half> [[FDOT2_LANE1]]
38+
//
39+
float16x4_t test_vdot_lane_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
40+
return vdot_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr);
41+
}
42+
43+
// CHECK-LABEL: define dso_local <4 x half> @test_vdot_laneq_f16(
44+
// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
45+
// CHECK-NEXT: [[ENTRY:.*:]]
46+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
47+
// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
48+
// CHECK-NEXT: ret <4 x half> [[FDOT2_LANE1]]
49+
//
50+
float16x4_t test_vdot_laneq_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) {
51+
return vdot_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr);
52+
}
53+
54+
// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_lane_f16(
55+
// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
56+
// CHECK-NEXT: [[ENTRY:.*:]]
57+
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
58+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
59+
// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3)
60+
// CHECK-NEXT: ret <8 x half> [[FDOT2_LANE1]]
61+
//
62+
float16x8_t test_vdotq_lane_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) {
63+
return vdotq_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr);
64+
}
65+
66+
// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_laneq_f16(
67+
// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
68+
// CHECK-NEXT: [[ENTRY:.*:]]
69+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
70+
// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
71+
// CHECK-NEXT: ret <8 x half> [[FDOT2_LANE1]]
72+
//
73+
float16x8_t test_vdotq_laneq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
74+
return vdotq_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr);
75+
}
76+
77+
// CHECK-LABEL: define dso_local <2 x float> @test_vdot_f32(
78+
// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
79+
// CHECK-NEXT: [[ENTRY:.*:]]
80+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
81+
// CHECK-NEXT: [[FDOT4_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
82+
// CHECK-NEXT: ret <2 x float> [[FDOT4_I]]
83+
//
84+
float32x2_t test_vdot_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
85+
return vdot_f32_mf8_fpm(vd, vn, vm, fpmr);
86+
}
87+
88+
// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_f32(
89+
// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
90+
// CHECK-NEXT: [[ENTRY:.*:]]
91+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
92+
// CHECK-NEXT: [[FDOT4_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
93+
// CHECK-NEXT: ret <4 x float> [[FDOT4_I]]
94+
//
95+
float32x4_t test_vdotq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
96+
return vdotq_f32_mf8_fpm(vd, vn, vm, fpmr);
97+
}
98+
99+
// CHECK-LABEL: define dso_local <2 x float> @test_vdot_lane_f32(
100+
// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
101+
// CHECK-NEXT: [[ENTRY:.*:]]
102+
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
103+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
104+
// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
105+
// CHECK-NEXT: ret <2 x float> [[FDOT4_LANE]]
106+
//
107+
float32x2_t test_vdot_lane_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
108+
return vdot_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr);
109+
}
110+
111+
// CHECK-LABEL: define dso_local <2 x float> @test_vdot_laneq_f32(
112+
// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
113+
// CHECK-NEXT: [[ENTRY:.*:]]
114+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
115+
// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
116+
// CHECK-NEXT: ret <2 x float> [[FDOT4_LANE]]
117+
//
118+
float32x2_t test_vdot_laneq_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) {
119+
return vdot_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr);
120+
}
121+
122+
// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_lane_f32(
123+
// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
124+
// CHECK-NEXT: [[ENTRY:.*:]]
125+
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
126+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
127+
// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
128+
// CHECK-NEXT: ret <4 x float> [[FDOT4_LANE]]
129+
//
130+
float32x4_t test_vdotq_lane_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) {
131+
return vdotq_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr);
132+
}
133+
134+
// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_laneq_f32(
135+
// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
136+
// CHECK-NEXT: [[ENTRY:.*:]]
137+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
138+
// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
139+
// CHECK-NEXT: ret <4 x float> [[FDOT4_LANE]]
140+
//
141+
float32x4_t test_vdotq_laneq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
142+
return vdotq_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr);
143+
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +fp8 -emit-llvm -verify %s -o /dev/null
2+
3+
#include <arm_neon.h>
4+
// REQUIRES: aarch64-registered-target
5+
6+
7+
void test_features(float16x4_t vd4, float16x8_t vd8, float32x2_t va2,
8+
float32x4_t va4, mfloat8x8_t v8, mfloat8x16_t v16,
9+
fpm_t fpmr) {
10+
(void) vdot_f16_mf8_fpm(vd4, v8, v8, fpmr);
11+
// expected-error@-1 {{'vdot_f16_mf8_fpm' requires target feature 'fp8dot2'}}
12+
(void) vdotq_f16_mf8_fpm(vd8, v16, v16, fpmr);
13+
// expected-error@-1 {{'vdotq_f16_mf8_fpm' requires target feature 'fp8dot2'}}
14+
(void) vdot_lane_f16_mf8_fpm(vd4, v8, v8, 3, fpmr);
15+
// expected-error@-1 {{'__builtin_neon_vdot_lane_f16_mf8_fpm' needs target feature fp8dot2,neon}}
16+
(void) vdot_laneq_f16_mf8_fpm(vd4, v8, v16, 7, fpmr);
17+
// expected-error@-1 {{'__builtin_neon_vdot_laneq_f16_mf8_fpm' needs target feature fp8dot2,neon}}
18+
(void) vdotq_lane_f16_mf8_fpm(vd8, v16, v8, 3, fpmr);
19+
// expected-error@-1 {{'__builtin_neon_vdotq_lane_f16_mf8_fpm' needs target feature fp8dot2,neon}}
20+
(void) vdotq_laneq_f16_mf8_fpm(vd8, v16, v16, 7, fpmr);
21+
// expected-error@-1 {{'__builtin_neon_vdotq_laneq_f16_mf8_fpm' needs target feature fp8dot2,neon}}
22+
23+
(void) vdot_f32_mf8_fpm(va2, v8, v8, fpmr);
24+
// expected-error@-1 {{'vdot_f32_mf8_fpm' requires target feature 'fp8dot4'}}
25+
(void) vdotq_f32_mf8_fpm(va4, v16, v16, fpmr);
26+
// expected-error@-1 {{'vdotq_f32_mf8_fpm' requires target feature 'fp8dot4}}
27+
(void) vdot_lane_f32_mf8_fpm(va2, v8, v8, 1, fpmr);
28+
// expected-error@-1 {{'__builtin_neon_vdot_lane_f32_mf8_fpm' needs target feature fp8dot4,neon}}
29+
(void) vdot_laneq_f32_mf8_fpm(va2, v8, v16, 3, fpmr);
30+
// expected-error@-1 {{'__builtin_neon_vdot_laneq_f32_mf8_fpm' needs target feature fp8dot4,neon}}
31+
(void) vdotq_lane_f32_mf8_fpm(va4, v16, v8, 1, fpmr);
32+
// expected-error@-1 {{'__builtin_neon_vdotq_lane_f32_mf8_fpm' needs target feature fp8dot4,neon}}
33+
(void) vdotq_laneq_f32_mf8_fpm(va4, v16, v16, 3, fpmr);
34+
// expected-error@-1 {{'__builtin_neon_vdotq_laneq_f32_mf8_fpm' needs target feature fp8dot4,neon}}
35+
}
36+
37+
void test_imm(float16x4_t vd4, float16x8_t vd8, float32x2_t va2, float32x4_t va4,
38+
mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpmr) {
39+
(void) vdot_lane_f16_mf8_fpm(vd4, v8, v8, -1, fpmr);
40+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
41+
(void) vdot_laneq_f16_mf8_fpm(vd4, v8, v16, -1, fpmr);
42+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
43+
(void) vdotq_lane_f16_mf8_fpm(vd8, v16, v8, -1, fpmr);
44+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
45+
(void) vdotq_laneq_f16_mf8_fpm(vd8, v16, v16, -1, fpmr);
46+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
47+
(void) vdot_lane_f32_mf8_fpm(va2, v8, v8, -1, fpmr);
48+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
49+
(void) vdot_laneq_f32_mf8_fpm(va2, v8, v16, -1, fpmr);
50+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
51+
(void) vdotq_lane_f32_mf8_fpm(va4, v16, v8, -1, fpmr);
52+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
53+
(void) vdotq_laneq_f32_mf8_fpm(va4, v16, v16, -1, fpmr);
54+
// expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
55+
}
56+
57+

0 commit comments

Comments
 (0)