Skip to content

Commit ed8ad46

Browse files
authored
[AArch64][GlobalISel] Added usdot intrinsic support (#162615)
GlobalISel now lowers the llvm.aarch64.neon.usdot intrinsic to a new G_USDOT generic instruction and selects it directly, without falling back to SelectionDAG (SDAG).
1 parent 4f23767 commit ed8ad46

File tree

3 files changed

+68
-28
lines changed

3 files changed

+68
-28
lines changed

llvm/lib/Target/AArch64/AArch64InstrGISel.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,12 @@ def G_SDOT : AArch64GenericInstruction {
233233
let hasSideEffects = 0;
234234
}
235235

236+
def G_USDOT : AArch64GenericInstruction {
237+
let OutOperandList = (outs type0:$dst);
238+
let InOperandList = (ins type0:$src1, type0:$src2, type0:$src3);
239+
let hasSideEffects = 0;
240+
}
241+
236242
// Generic instruction for the BSP pseudo. It is expanded into BSP, which
237243
// expands into BSL/BIT/BIF after register allocation.
238244
def G_BSP : AArch64GenericInstruction {
@@ -278,6 +284,7 @@ def : GINodeEquiv<G_UADDLV, AArch64uaddlv>;
278284

279285
def : GINodeEquiv<G_UDOT, AArch64udot>;
280286
def : GINodeEquiv<G_SDOT, AArch64sdot>;
287+
def : GINodeEquiv<G_USDOT, AArch64usdot>;
281288

282289
def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
283290

llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1855,6 +1855,8 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
18551855
return LowerTriOp(AArch64::G_UDOT);
18561856
case Intrinsic::aarch64_neon_sdot:
18571857
return LowerTriOp(AArch64::G_SDOT);
1858+
case Intrinsic::aarch64_neon_usdot:
1859+
return LowerTriOp(AArch64::G_USDOT);
18581860
case Intrinsic::aarch64_neon_sqxtn:
18591861
return LowerUnaryOp(TargetOpcode::G_TRUNC_SSAT_S);
18601862
case Intrinsic::aarch64_neon_sqxtun:

llvm/test/CodeGen/AArch64/aarch64-matmul.ll

Lines changed: 59 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,54 @@
1-
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s -o -| FileCheck %s
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s | FileCheck %s
3+
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s
24

35
define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
6+
; CHECK-LABEL: smmla.v4i32.v16i8:
7+
; CHECK: // %bb.0: // %entry
8+
; CHECK-NEXT: smmla v0.4s, v1.16b, v2.16b
9+
; CHECK-NEXT: ret
410
entry:
5-
; CHECK-LABEL: smmla.v4i32.v16i8
6-
; CHECK: smmla v0.4s, v1.16b, v2.16b
711
%vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
812
ret <4 x i32> %vmmla1.i
913
}
1014

1115
define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
16+
; CHECK-LABEL: ummla.v4i32.v16i8:
17+
; CHECK: // %bb.0: // %entry
18+
; CHECK-NEXT: ummla v0.4s, v1.16b, v2.16b
19+
; CHECK-NEXT: ret
1220
entry:
13-
; CHECK-LABEL: ummla.v4i32.v16i8
14-
; CHECK: ummla v0.4s, v1.16b, v2.16b
1521
%vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
1622
ret <4 x i32> %vmmla1.i
1723
}
1824

1925
define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
26+
; CHECK-LABEL: usmmla.v4i32.v16i8:
27+
; CHECK: // %bb.0: // %entry
28+
; CHECK-NEXT: usmmla v0.4s, v1.16b, v2.16b
29+
; CHECK-NEXT: ret
2030
entry:
21-
; CHECK-LABEL: usmmla.v4i32.v16i8
22-
; CHECK: usmmla v0.4s, v1.16b, v2.16b
2331
%vusmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
2432
ret <4 x i32> %vusmmla1.i
2533
}
2634

2735
define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
36+
; CHECK-LABEL: usdot.v2i32.v8i8:
37+
; CHECK: // %bb.0: // %entry
38+
; CHECK-NEXT: usdot v0.2s, v1.8b, v2.8b
39+
; CHECK-NEXT: ret
2840
entry:
29-
; CHECK-LABEL: usdot.v2i32.v8i8
30-
; CHECK: usdot v0.2s, v1.8b, v2.8b
3141
%vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
3242
ret <2 x i32> %vusdot1.i
3343
}
3444

3545
define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
46+
; CHECK-LABEL: usdot_lane.v2i32.v8i8:
47+
; CHECK: // %bb.0: // %entry
48+
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
49+
; CHECK-NEXT: usdot v0.2s, v1.8b, v2.4b[0]
50+
; CHECK-NEXT: ret
3651
entry:
37-
; CHECK-LABEL: usdot_lane.v2i32.v8i8
38-
; CHECK: usdot v0.2s, v1.8b, v2.4b[0]
3952
%0 = bitcast <8 x i8> %b to <2 x i32>
4053
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
4154
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -44,9 +57,12 @@ entry:
4457
}
4558

4659
define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
60+
; CHECK-LABEL: sudot_lane.v2i32.v8i8:
61+
; CHECK: // %bb.0: // %entry
62+
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
63+
; CHECK-NEXT: sudot v0.2s, v1.8b, v2.4b[0]
64+
; CHECK-NEXT: ret
4765
entry:
48-
; CHECK-LABEL: sudot_lane.v2i32.v8i8
49-
; CHECK: sudot v0.2s, v1.8b, v2.4b[0]
5066
%0 = bitcast <8 x i8> %b to <2 x i32>
5167
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
5268
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -55,9 +71,11 @@ entry:
5571
}
5672

5773
define <2 x i32> @usdot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
74+
; CHECK-LABEL: usdot_lane.v2i32.v16i8:
75+
; CHECK: // %bb.0: // %entry
76+
; CHECK-NEXT: usdot v0.2s, v1.8b, v2.4b[0]
77+
; CHECK-NEXT: ret
5878
entry:
59-
; CHECK-LABEL: usdot_lane.v2i32.v16i8
60-
; CHECK: usdot v0.2s, v1.8b, v2.4b[0]
6179
%0 = bitcast <16 x i8> %b to <4 x i32>
6280
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
6381
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -66,9 +84,11 @@ entry:
6684
}
6785

6886
define <2 x i32> @sudot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
87+
; CHECK-LABEL: sudot_lane.v2i32.v16i8:
88+
; CHECK: // %bb.0: // %entry
89+
; CHECK-NEXT: sudot v0.2s, v1.8b, v2.4b[0]
90+
; CHECK-NEXT: ret
6991
entry:
70-
; CHECK-LABEL: sudot_lane.v2i32.v16i8
71-
; CHECK: sudot v0.2s, v1.8b, v2.4b[0]
7292
%0 = bitcast <16 x i8> %b to <4 x i32>
7393
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
7494
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -77,17 +97,22 @@ entry:
7797
}
7898

7999
define <4 x i32> @usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
100+
; CHECK-LABEL: usdot.v4i32.v16i8:
101+
; CHECK: // %bb.0: // %entry
102+
; CHECK-NEXT: usdot v0.4s, v1.16b, v2.16b
103+
; CHECK-NEXT: ret
80104
entry:
81-
; CHECK-LABEL: usdot.v4i32.v16i8
82-
; CHECK: usdot v0.4s, v1.16b, v2.16b
83105
%vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
84106
ret <4 x i32> %vusdot1.i
85107
}
86108

87109
define <4 x i32> @usdot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
110+
; CHECK-LABEL: usdot_lane.v4i32.v16i8:
111+
; CHECK: // %bb.0: // %entry
112+
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
113+
; CHECK-NEXT: usdot v0.4s, v1.16b, v2.4b[0]
114+
; CHECK-NEXT: ret
88115
entry:
89-
; CHECK-LABEL: usdot_lane.v4i32.v16i8
90-
; CHECK: usdot v0.4s, v1.16b, v2.4b[0]
91116
%0 = bitcast <8 x i8> %b to <2 x i32>
92117
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
93118
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -96,9 +121,12 @@ entry:
96121
}
97122

98123
define <4 x i32> @sudot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
124+
; CHECK-LABEL: sudot_lane.v4i32.v16i8:
125+
; CHECK: // %bb.0: // %entry
126+
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
127+
; CHECK-NEXT: sudot v0.4s, v1.16b, v2.4b[0]
128+
; CHECK-NEXT: ret
99129
entry:
100-
; CHECK-LABEL: sudot_lane.v4i32.v16i8
101-
; CHECK: sudot v0.4s, v1.16b, v2.4b[0]
102130
%0 = bitcast <8 x i8> %b to <2 x i32>
103131
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
104132
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -107,9 +135,11 @@ entry:
107135
}
108136

109137
define <4 x i32> @usdot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
138+
; CHECK-LABEL: usdot_laneq.v4i32.v16i8:
139+
; CHECK: // %bb.0: // %entry
140+
; CHECK-NEXT: usdot v0.4s, v1.16b, v2.4b[0]
141+
; CHECK-NEXT: ret
110142
entry:
111-
; CHECK-LABEL: usdot_laneq.v4i32.v16i8
112-
; CHECK: usdot v0.4s, v1.16b, v2.4b[0]
113143
%0 = bitcast <16 x i8> %b to <4 x i32>
114144
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
115145
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -118,9 +148,11 @@ entry:
118148
}
119149

120150
define <4 x i32> @sudot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
151+
; CHECK-LABEL: sudot_laneq.v4i32.v16i8:
152+
; CHECK: // %bb.0: // %entry
153+
; CHECK-NEXT: sudot v0.4s, v1.16b, v2.4b[0]
154+
; CHECK-NEXT: ret
121155
entry:
122-
; CHECK-LABEL: sudot_laneq.v4i32.v16i8
123-
; CHECK: sudot v0.4s, v1.16b, v2.4b[0]
124156
%0 = bitcast <16 x i8> %b to <4 x i32>
125157
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
126158
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -133,4 +165,3 @@ declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16
133165
declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
134166
declare <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2
135167
declare <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
136-

0 commit comments

Comments
 (0)