Skip to content

Commit 22f5bb7

Browse files
[Clang][LLVM][AArch64] Add intrinsic for LUTI4 SME2 instruction
This patch adds these intrinsics: // Variants are also available for: _s8 svuint8x4_t svluti4_zt_u8_x4(uint64_t zt0, svuint8x2_t zn) __arm_streaming __arm_in("zt0"); according to PR#324[1] [1]ARM-software/acle#324
1 parent 76c84e7 commit 22f5bb7

File tree

7 files changed

+132
-4
lines changed

7 files changed

+132
-4
lines changed

clang/include/clang/Basic/arm_sme.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -817,4 +817,9 @@ multiclass ZAReadzArray<string vg_num>{
817817

818818
defm SVREADZ_VG2 : ZAReadzArray<"2">;
819819
defm SVREADZ_VG4 : ZAReadzArray<"4">;
820+
821+
let SMETargetGuard = "sme2,sme-lutv2" in {
822+
def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInOutZT0], [ImmCheck<0, ImmCheck0_0>]>;
823+
}
824+
820825
} // let SVETargetGuard = InvalidMode
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
3+
// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX
4+
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
5+
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -x c++ -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX
6+
7+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -S -Werror -Wall -o /dev/null %s
8+
// REQUIRES: aarch64-registered-target
9+
10+
#include <arm_sme.h>
11+
12+
// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_luti4_zt_u8_x4(
13+
// CHECK-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
14+
// CHECK-NEXT: [[ENTRY:.*:]]
15+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0)
16+
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16)
17+
// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
18+
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
19+
// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0)
20+
// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
21+
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16)
22+
// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
23+
// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 32)
24+
// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3
25+
// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48)
26+
// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP10]]
27+
//
28+
// CHECK-CXX-LABEL: define dso_local <vscale x 64 x i8> @_Z19test_luti4_zt_u8_x411svuint8x2_t(
29+
// CHECK-CXX-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
30+
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
31+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0)
32+
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16)
33+
// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
34+
// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
35+
// CHECK-CXX-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0)
36+
// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
37+
// CHECK-CXX-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16)
38+
// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
39+
// CHECK-CXX-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 32)
40+
// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3
41+
// CHECK-CXX-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48)
42+
// CHECK-CXX-NEXT: ret <vscale x 64 x i8> [[TMP10]]
43+
//
44+
svuint8x4_t test_luti4_zt_u8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") {
45+
return svluti4_zt_u8_x4(0, op);
46+
}
47+
48+
// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_luti4_zt_s8_x4(
49+
// CHECK-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0]] {
50+
// CHECK-NEXT: [[ENTRY:.*:]]
51+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0)
52+
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16)
53+
// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
54+
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
55+
// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0)
56+
// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
57+
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16)
58+
// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
59+
// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 32)
60+
// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3
61+
// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48)
62+
// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP10]]
63+
//
64+
// CHECK-CXX-LABEL: define dso_local <vscale x 64 x i8> @_Z19test_luti4_zt_s8_x410svint8x2_t(
65+
// CHECK-CXX-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0]] {
66+
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
67+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0)
68+
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16)
69+
// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
70+
// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
71+
// CHECK-CXX-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0)
72+
// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
73+
// CHECK-CXX-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16)
74+
// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
75+
// CHECK-CXX-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 32)
76+
// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3
77+
// CHECK-CXX-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48)
78+
// CHECK-CXX-NEXT: ret <vscale x 64 x i8> [[TMP10]]
79+
//
80+
svint8x4_t test_luti4_zt_s8_x4(svint8x2_t op) __arm_streaming __arm_in("zt0") {
81+
return svluti4_zt_s8_x4(0, op);
82+
}

clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,3 +350,8 @@ void test_svdot_multi_za32_bad_lane(uint32_t slice_base, svuint16_t z_u16,
350350
svsudot_lane_za32_s8_vg1x2(slice_base, z_s8x2, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
351351
svsudot_lane_za32_s8_vg1x4(slice_base, z_s8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
352352
}
353+
354+
void test_rluti4_zt_x4(svuint8x2_t op) __arm_streaming __arm_inout("zt0") {
355+
// Check Zt tile 0
356+
svluti4_zt_u8_x4(1, op); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
357+
}

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3678,6 +3678,12 @@ let TargetPrefix = "aarch64" in {
36783678
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
36793679
[llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
36803680
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
3681+
3682+
def int_aarch64_sme_luti4_zt_x4
3683+
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
3684+
[llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
3685+
[ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
3686+
36813687
}
36823688

36833689
// SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2

llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
410410
}
411411

412412
void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
413-
uint32_t MaxImm);
413+
uint32_t MaxImm, bool IsMultiVector = false);
414414

415415
template <unsigned MaxIdx, unsigned Scale>
416416
bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
@@ -1896,15 +1896,23 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs,
18961896

18971897
void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
18981898
unsigned NumOutVecs,
1899-
unsigned Opc, uint32_t MaxImm) {
1899+
unsigned Opc, uint32_t MaxImm,
1900+
bool IsMultiVector) {
19001901
if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Node->getOperand(4)))
19011902
if (Imm->getZExtValue() > MaxImm)
19021903
return;
19031904

19041905
SDValue ZtValue;
1906+
SmallVector<SDValue, 4> Ops;
19051907
if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
19061908
return;
1907-
SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)};
1909+
Ops.push_back(ZtValue);
1910+
if (IsMultiVector) {
1911+
Ops.push_back(createZMulTuple({Node->getOperand(3), Node->getOperand(4)}));
1912+
} else {
1913+
Ops.push_back(Node->getOperand(3));
1914+
Ops.push_back(Node->getOperand(4));
1915+
}
19081916
SDLoc DL(Node);
19091917
EVT VT = Node->getValueType(0);
19101918

@@ -5415,6 +5423,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
54155423
SelectMultiVectorLuti(Node, 2, Opc, 3);
54165424
return;
54175425
}
5426+
case Intrinsic::aarch64_sme_luti4_zt_x4: {
5427+
// Does not have immediate but it has 2ZPR input
5428+
SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z, 0, true);
5429+
return;
5430+
}
54185431
}
54195432
} break;
54205433
case ISD::INTRINSIC_WO_CHAIN: {

llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -936,7 +936,7 @@ defm FAMIN_4Z4Z : sme2_fp_sve_destructive_vector_vg4_multi<"famin", 0b0010101>;
936936

937937
let Predicates = [HasSME2, HasSME_LUTv2] in {
938938
defm MOVT : sme2_movt_zt_to_zt<"movt", 0b0011111>;
939-
def LUTI4_4ZZT2Z : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">;
939+
def LUTI4_4ZZT2Z : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">;
940940
} //[HasSME2, HasSME_LUTv2]
941941

942942
let Predicates = [HasSME2p1, HasSME_LUTv2] in {
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -verify-machineinstrs -force-streaming < %s | FileCheck %s
3+
4+
target triple = "aarch64-linux"
5+
6+
define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @test_luti4_zt_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1) #0 {
7+
; CHECK-LABEL: test_luti4_zt_i8:
8+
; CHECK: // %bb.0:
9+
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
10+
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
11+
; CHECK-NEXT: luti4 { z0.b - z3.b }, zt0, { z0, z1 }
12+
; CHECK-NEXT: ret
13+
%res = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1)
14+
ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %res
15+
}
16+
17+
attributes #0 = { "target-features"="+sme2,+sme-lutv2"}

0 commit comments

Comments
 (0)