Skip to content

Commit 533acb2

Browse files
committed
fixup! [AArch64][llvm] Add support for vmmlaq_[f16,f32]_mf8 intrinsics
Add extra lowering
1 parent 846648d commit 533acb2

File tree

7 files changed

+64
-10
lines changed

7 files changed

+64
-10
lines changed

clang/lib/CodeGen/TargetBuiltins/ARM.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7627,12 +7627,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
76277627
case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
76287628
return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
76297629
{llvm::FixedVectorType::get(HalfTy, 8),
7630-
llvm::FixedVectorType::get(HalfTy, 8)},
7630+
llvm::FixedVectorType::get(Int8Ty, 16)},
76317631
Ops, E, "fmmla");
76327632
case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
76337633
return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
76347634
{llvm::FixedVectorType::get(FloatTy, 4),
7635-
llvm::FixedVectorType::get(FloatTy, 4)},
7635+
llvm::FixedVectorType::get(Int8Ty, 16)},
76367636
Ops, E, "fmmla");
76377637
case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
76387638
ExtractLow = true;

clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,8 @@
1515
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
1616
// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
1717
// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
18-
// CHECK-NEXT: [[FMMLA1_I:%.*]] = bitcast <16 x i8> [[P1]] to <8 x half>
19-
// CHECK-NEXT: [[FMMLA2_I:%.*]] = bitcast <16 x i8> [[P2]] to <8 x half>
20-
// CHECK-NEXT: [[FMMLA3_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v8f16(<8 x half> [[FMMLA_I]], <8 x half> [[FMMLA1_I]], <8 x half> [[FMMLA2_I]])
21-
// CHECK-NEXT: ret <8 x half> [[FMMLA3_I]]
18+
// CHECK-NEXT: [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
19+
// CHECK-NEXT: ret <8 x half> [[FMMLA1_I]]
2220
//
2321
float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
2422
return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);

clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,8 @@
1212
// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
1313
// CHECK-NEXT: [[ENTRY:.*:]]
1414
// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
15-
// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[P1]] to <4 x float>
16-
// CHECK-NEXT: [[FMMLA1_I:%.*]] = bitcast <16 x i8> [[P2]] to <4 x float>
17-
// CHECK-NEXT: [[FMMLA2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v4f32(<4 x float> [[P0]], <4 x float> [[FMMLA_I]], <4 x float> [[FMMLA1_I]])
18-
// CHECK-NEXT: ret <4 x float> [[FMMLA2_I]]
15+
// CHECK-NEXT: [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
16+
// CHECK-NEXT: ret <4 x float> [[FMMLA_I]]
1917
//
2018
float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
2119
return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9111,6 +9111,26 @@ class SIMDThreeSameVectorMatMul<bit B, bit U, string asm, SDPatternOperator OpNo
91119111
let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b|.4s\t$Rd, $Rn, $Rm}";
91129112
}
91139113

9114+
// Matrix multiply-accumulate instructions taking two 16 x i8 source vectors
// and a floating-point accumulator, built on the tied three-same-vector base
// class. Two variants are defined, each with its own predicate list and an
// ISel pattern on OpNode:
//   fp16 - v8f16 accumulator/result, requires HasNEON + HasF8F16MM
//   fp32 - v4f32 accumulator/result, requires HasNEON + HasF8F32MM
// B and U are forwarded into the encoding; asm is the mnemonic.
multiclass SIMDThreeSameVectorMatMulFP<bit B, bit U, string asm, SDPatternOperator OpNode> {
  let Predicates = [HasNEON, HasF8F16MM] in {
    def fp16 : BaseSIMDThreeSameVectorTied<1, U, 0b000, {0b1101, B}, V128, asm, ".8h",
                 [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd),
                                                  (v16i8 V128:$Rn),
                                                  (v16i8 V128:$Rm)))]> {
      // The destination is a full 128-bit register of eight halves, so both
      // assembler variants must spell it as .8h, in agreement with the kind
      // suffix passed above and the v8f16 type used in the pattern.
      let AsmString = asm # "{\t$Rd.8h, $Rn.16b, $Rm.16b|.8h\t$Rd, $Rn, $Rm}";
    }
  }

  let Predicates = [HasNEON, HasF8F32MM] in {
    def fp32 : BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1101, B}, V128, asm, ".4s",
                 [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
                                                  (v16i8 V128:$Rn),
                                                  (v16i8 V128:$Rm)))]> {
      let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b|.4s\t$Rd, $Rn, $Rm}";
    }
  }
}
9133+
91149134
//----------------------------------------------------------------------------
91159135
// ARMv8.2-A Dot Product Instructions (Indexed)
91169136
class BaseSIMDThreeSameVectorIndexS<bit Q, bit U, bits<2> size, bits<4> opc, string asm,

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1731,10 +1731,21 @@ def BFCVT : BF16ToSinglePrecision<"bfcvt">;
17311731
let Predicates = [HasMatMulInt8] in {
17321732
def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;
17331733
def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>;
1734+
defm FMMLA : SIMDThreeSameVectorMatMulFP<1, 1, "fmmla", int_aarch64_neon_fmmla>;
17341735
def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>;
17351736
defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", AArch64usdot>;
17361737
defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", AArch64usdot>;
17371738

1739+
// FMMLA fp16
// This pattern is written inside the enclosing
// `let Predicates = [HasMatMulInt8] in { ... }` block, which would otherwise
// gate it on the int8 matrix-multiply feature. Give it the predicates of the
// instruction it selects so it only fires when FMMLAfp16 is available.
let Predicates = [HasNEON, HasF8F16MM] in
def : Pat<(v8f16 (int_aarch64_neon_fmmla
              (v8f16 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm))),
          (FMMLAfp16 V128:$Rd, V128:$Rn, V128:$Rm)>;
1743+
1744+
// FMMLA fp32
// This pattern is written inside the enclosing
// `let Predicates = [HasMatMulInt8] in { ... }` block, which would otherwise
// gate it on the int8 matrix-multiply feature. Give it the predicates of the
// instruction it selects so it only fires when FMMLAfp32 is available.
let Predicates = [HasNEON, HasF8F32MM] in
def : Pat<(v4f32 (int_aarch64_neon_fmmla
              (v4f32 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm))),
          (FMMLAfp32 V128:$Rd, V128:$Rn, V128:$Rm)>;
1748+
17381749
// sudot lane has a pattern where usdot is expected (there is no sudot).
17391750
// The second operand is used in the dup operation to repeat the indexed
17401751
// element.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm < %s | FileCheck %s
3+
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm -global-isel < %s | FileCheck %s
4+
5+
; Codegen test: a call to @llvm.aarch64.neon.fmmla.v8f16.v16i8 with an
; <8 x half> accumulator and two <16 x i8> sources is expected to become a
; single fmmla instruction followed by ret.
; NOTE(review): the CHECK below expects a `.4h` destination even though the
; result type is <8 x half>; confirm whether `.8h` is intended and regenerate
; the assertions if the instruction's asm string changes.
; NOTE(review): the call references attribute group #3, which is not defined in
; the visible part of this file - confirm it is needed.
define <8 x half> @fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) {
6+
; CHECK-LABEL: fmmla.v8f16.v16i8:
7+
; CHECK: // %bb.0: // %entry
8+
; CHECK-NEXT: fmmla v0.4h, v1.16b, v2.16b
9+
; CHECK-NEXT: ret
10+
entry:
11+
%vfmmla1.i = tail call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) #3
12+
ret <8 x half> %vfmmla1.i
13+
}
14+
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm < %s | FileCheck %s
3+
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm -global-isel < %s | FileCheck %s
4+
5+
; Codegen test: a call to @llvm.aarch64.neon.fmmla.v4f32.v16i8 with a
; <4 x float> accumulator and two <16 x i8> sources is expected to become a
; single `fmmla v0.4s, v1.16b, v2.16b` followed by ret.
; NOTE(review): the call references attribute group #3, which is not defined in
; the visible part of this file - confirm it is needed.
define <4 x float> @fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) {
6+
; CHECK-LABEL: fmmla.v4f32.v16i8:
7+
; CHECK: // %bb.0: // %entry
8+
; CHECK-NEXT: fmmla v0.4s, v1.16b, v2.16b
9+
; CHECK-NEXT: ret
10+
entry:
11+
%vfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) #3
12+
ret <4 x float> %vfmmla1.i
13+
}

0 commit comments

Comments
 (0)