Skip to content

Commit 525dd5f

Browse files
committed
fixup! [AArch64][llvm] Add support for vmmlaq_[f16,f32]_mf8 intrinsics
Fix CR comments; don't create a new intrinsic, and split test files
1 parent ad6aadb commit 525dd5f

File tree

4 files changed

+43
-34
lines changed

4 files changed

+43
-34
lines changed

clang/lib/CodeGen/TargetBuiltins/ARM.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7795,11 +7795,11 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
77957795
}
77967796
case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
77977797
return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
7798-
{llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
7798+
{llvm::FixedVectorType::get(HalfTy, 8), llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
77997799
"fmmla");
78007800
case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
78017801
return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
7802-
{llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
7802+
{llvm::FixedVectorType::get(FloatTy, 4), llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
78037803
"fmmla");
78047804
case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
78057805
ExtractLow = true;

clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c

Lines changed: 1 addition & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2-
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm -target-feature +f8f16mm -target-feature +f8f32mm -target-feature +fp8 \
2+
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
33
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
44
// RUN: | opt -S -passes=mem2reg,sroa \
55
// RUN: | FileCheck %s
@@ -32,31 +32,6 @@ uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
3232
return vmmlaq_u32(r, a, b);
3333
}
3434

35-
// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
36-
// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
37-
// CHECK-NEXT: [[ENTRY:.*:]]
38-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
39-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
40-
// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
41-
// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
42-
// CHECK-NEXT: [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
43-
// CHECK-NEXT: ret <8 x half> [[FMMLA1_I]]
44-
//
45-
float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
46-
return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
47-
}
48-
49-
// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
50-
// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
51-
// CHECK-NEXT: [[ENTRY:.*:]]
52-
// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
53-
// CHECK-NEXT: [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
54-
// CHECK-NEXT: ret <4 x float> [[FMMLA_I]]
55-
//
56-
float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
57-
return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
58-
}
59-
6035
// CHECK-LABEL: define dso_local <4 x i32> @test_vusmmlaq_s32(
6136
// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
6237
// CHECK-NEXT: [[ENTRY:.*:]]
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f16mm -target-feature +f8f32mm -target-feature +fp8 \
3+
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
4+
// RUN: | opt -S -passes=mem2reg,sroa \
5+
// RUN: | FileCheck %s
6+
7+
// REQUIRES: aarch64-registered-target
8+
9+
#include <arm_neon.h>
10+
11+
// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
12+
// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
13+
// CHECK-NEXT: [[ENTRY:.*:]]
14+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
15+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
16+
// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
17+
// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
18+
// CHECK-NEXT: [[FMMLA1_I:%.*]] = bitcast <16 x i8> [[P1]] to <8 x half>
19+
// CHECK-NEXT: [[FMMLA2_I:%.*]] = bitcast <16 x i8> [[P2]] to <8 x half>
20+
// CHECK-NEXT: [[FMMLA3_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v8f16(<8 x half> [[FMMLA_I]], <8 x half> [[FMMLA1_I]], <8 x half> [[FMMLA2_I]])
21+
// CHECK-NEXT: ret <8 x half> [[FMMLA3_I]]
22+
//
23+
float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
24+
return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
25+
}
26+
27+
// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
28+
// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
29+
// CHECK-NEXT: [[ENTRY:.*:]]
30+
// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
31+
// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[P1]] to <4 x float>
32+
// CHECK-NEXT: [[FMMLA1_I:%.*]] = bitcast <16 x i8> [[P2]] to <4 x float>
33+
// CHECK-NEXT: [[FMMLA2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v4f32(<4 x float> [[P0]], <4 x float> [[FMMLA_I]], <4 x float> [[FMMLA1_I]])
34+
// CHECK-NEXT: ret <4 x float> [[FMMLA2_I]]
35+
//
36+
float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
37+
return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
38+
}
39+

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -217,11 +217,6 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
217217
: DefaultAttrsIntrinsic<[llvm_v4f32_ty],
218218
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
219219
[IntrNoMem]>;
220-
221-
class AdvSIMD_MatMul_fpm_Intrinsic
222-
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
223-
[LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty],
224-
[IntrNoMem]>;
225220
}
226221

227222
// Arithmetic ops
@@ -504,7 +499,7 @@ let TargetPrefix = "aarch64" in {
504499
def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
505500
def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
506501
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
507-
def int_aarch64_neon_fmmla : AdvSIMD_MatMul_fpm_Intrinsic;
502+
def int_aarch64_neon_fmmla : AdvSIMD_MatMul_Intrinsic;
508503
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
509504
def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
510505
def int_aarch64_neon_bfmmla

0 commit comments

Comments
 (0)