@@ -26,6 +26,66 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
2626 ret <4 x i32 > %partial.reduce
2727}
2828
; udot_in_loop: partial-reduce add of an unsigned i8 x i8 product whose
; accumulator is carried around a loop by a phi.
;
; Each iteration loads 16 bytes from %p1 and from %p2 at the same byte offset,
; zero-extends both to <16 x i32>, multiplies them and folds the product into a
; <4 x i32> accumulator with llvm.experimental.vector.partial.reduce.add.
;
; Expected output below: the CHECK-DOT lines want a single `udot` in the loop
; body; the CHECK-NODOT lines want the umull/umull2 + uaddw/uaddw2 expansion.
; (Presumably these prefixes are the runs with and without the dot-product
; feature -- the RUN lines are outside this excerpt, confirm there.)
;
; NOTE(review): %end returns the phi %accum, not %reduced. The expected
; `mov v0.16b, v1.16b` ahead of the accumulator update looks consistent with
; that. Confirm it is intentional before changing it; the CHECK lines would
; have to be regenerated.
define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2) {
; CHECK-DOT-LABEL: udot_in_loop:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-DOT-NEXT: mov x8, xzr
; CHECK-DOT-NEXT: .LBB3_1: // %vector.body
; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-DOT-NEXT: ldr q2, [x0, x8]
; CHECK-DOT-NEXT: ldr q3, [x1, x8]
; CHECK-DOT-NEXT: mov v0.16b, v1.16b
; CHECK-DOT-NEXT: add x8, x8, #16
; CHECK-DOT-NEXT: udot v1.4s, v2.16b, v3.16b
; CHECK-DOT-NEXT: cmp x8, #16
; CHECK-DOT-NEXT: b.ne .LBB3_1
; CHECK-DOT-NEXT: // %bb.2: // %end
; CHECK-DOT-NEXT: ret
;
; CHECK-NODOT-LABEL: udot_in_loop:
; CHECK-NODOT: // %bb.0: // %entry
; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-NODOT-NEXT: mov x8, xzr
; CHECK-NODOT-NEXT: .LBB3_1: // %vector.body
; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NODOT-NEXT: ldr q0, [x0, x8]
; CHECK-NODOT-NEXT: ldr q2, [x1, x8]
; CHECK-NODOT-NEXT: add x8, x8, #16
; CHECK-NODOT-NEXT: cmp x8, #16
; CHECK-NODOT-NEXT: umull v3.8h, v0.8b, v2.8b
; CHECK-NODOT-NEXT: umull2 v2.8h, v0.16b, v2.16b
; CHECK-NODOT-NEXT: mov v0.16b, v1.16b
; CHECK-NODOT-NEXT: ushll v1.4s, v2.4h, #0
; CHECK-NODOT-NEXT: uaddw v4.4s, v0.4s, v3.4h
; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v3.8h
; CHECK-NODOT-NEXT: uaddw2 v2.4s, v4.4s, v2.8h
; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-NODOT-NEXT: b.ne .LBB3_1
; CHECK-NODOT-NEXT: // %bb.2: // %end
; CHECK-NODOT-NEXT: ret
entry:
  br label %vector.body

vector.body:
  ; Loop-carried state: byte offset and the running <4 x i32> accumulator.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %vector.body ]
  %accum = phi <4 x i32> [ zeroinitializer, %entry ], [ %reduced, %vector.body ]
  ; First operand: 16 bytes from %p1, zero-extended.
  %addr.a = getelementptr i8, ptr %p1, i64 %iv
  %bytes.a = load <16 x i8>, ptr %addr.a, align 16
  %ext.a = zext <16 x i8> %bytes.a to <16 x i32>
  ; Second operand: 16 bytes from %p2, zero-extended.
  %addr.b = getelementptr i8, ptr %p2, i64 %iv
  %bytes.b = load <16 x i8>, ptr %addr.b, align 16
  %ext.b = zext <16 x i8> %bytes.b to <16 x i32>
  %prod = mul nuw nsw <16 x i32> %ext.a, %ext.b
  %reduced = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %accum, <16 x i32> %prod)
  ; Step is 16 and the exit test compares against 16.
  %iv.next = add nuw i64 %iv, 16
  %done = icmp eq i64 %iv.next, 16
  br i1 %done, label %end, label %vector.body

end:
  ret <4 x i32> %accum
}
88+
2989define <2 x i32 > @udot_narrow (<2 x i32 > %acc , <8 x i8 > %u , <8 x i8 > %s ) {
3090; CHECK-DOT-LABEL: udot_narrow:
3191; CHECK-DOT: // %bb.0:
@@ -128,6 +188,68 @@ define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
128188 ret <4 x i32 > %partial.reduce
129189}
130190
; usdot_in_loop: partial-reduce add of a mixed-sign i8 x i8 product whose
; accumulator is carried around a loop by a phi.
;
; The 16 bytes loaded from %p1 are sign-extended and the 16 bytes loaded from
; %p2 are zero-extended before the <16 x i32> multiply; the product is folded
; into a <4 x i32> accumulator with
; llvm.experimental.vector.partial.reduce.add.
;
; Expected output below: the CHECK-I8MM lines want one `usdot` with the %p2
; load (q3, the zero-extended side) as its first source; the CHECK-NOI8MM lines
; want the sshll/ushll widening followed by smlal/smull/smlal2.
; (Presumably these prefixes are the runs with and without the i8mm feature --
; the RUN lines are outside this excerpt, confirm there.)
;
; NOTE(review): %end returns the phi %accum, not %reduced. Confirm that is
; intentional before changing it; the CHECK lines would have to be regenerated.
define <4 x i32> @usdot_in_loop(ptr %p1, ptr %p2) {
; CHECK-NOI8MM-LABEL: usdot_in_loop:
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: movi v1.2d, #0000000000000000
; CHECK-NOI8MM-NEXT: mov x8, xzr
; CHECK-NOI8MM-NEXT: .LBB10_1: // %vector.body
; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NOI8MM-NEXT: ldr q0, [x0, x8]
; CHECK-NOI8MM-NEXT: ldr q2, [x1, x8]
; CHECK-NOI8MM-NEXT: add x8, x8, #16
; CHECK-NOI8MM-NEXT: cmp x8, #16
; CHECK-NOI8MM-NEXT: sshll v3.8h, v0.8b, #0
; CHECK-NOI8MM-NEXT: sshll2 v4.8h, v0.16b, #0
; CHECK-NOI8MM-NEXT: ushll v5.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b
; CHECK-NOI8MM-NEXT: smlal v1.4s, v3.4h, v5.4h
; CHECK-NOI8MM-NEXT: smull v6.4s, v4.4h, v2.4h
; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v2.8h
; CHECK-NOI8MM-NEXT: smlal2 v6.4s, v3.8h, v5.8h
; CHECK-NOI8MM-NEXT: add v1.4s, v6.4s, v1.4s
; CHECK-NOI8MM-NEXT: b.ne .LBB10_1
; CHECK-NOI8MM-NEXT: // %bb.2: // %end
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-I8MM-LABEL: usdot_in_loop:
; CHECK-I8MM: // %bb.0: // %entry
; CHECK-I8MM-NEXT: movi v1.2d, #0000000000000000
; CHECK-I8MM-NEXT: mov x8, xzr
; CHECK-I8MM-NEXT: .LBB10_1: // %vector.body
; CHECK-I8MM-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-I8MM-NEXT: ldr q2, [x0, x8]
; CHECK-I8MM-NEXT: ldr q3, [x1, x8]
; CHECK-I8MM-NEXT: mov v0.16b, v1.16b
; CHECK-I8MM-NEXT: add x8, x8, #16
; CHECK-I8MM-NEXT: usdot v1.4s, v3.16b, v2.16b
; CHECK-I8MM-NEXT: cmp x8, #16
; CHECK-I8MM-NEXT: b.ne .LBB10_1
; CHECK-I8MM-NEXT: // %bb.2: // %end
; CHECK-I8MM-NEXT: ret
entry:
  br label %vector.body

vector.body:
  ; Loop-carried state: byte offset and the running <4 x i32> accumulator.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %vector.body ]
  %accum = phi <4 x i32> [ zeroinitializer, %entry ], [ %reduced, %vector.body ]
  ; Signed operand: 16 bytes from %p1, sign-extended.
  %addr.s = getelementptr i8, ptr %p1, i64 %iv
  %bytes.s = load <16 x i8>, ptr %addr.s, align 16
  %ext.s = sext <16 x i8> %bytes.s to <16 x i32>
  ; Unsigned operand: 16 bytes from %p2, zero-extended.
  %addr.u = getelementptr i8, ptr %p2, i64 %iv
  %bytes.u = load <16 x i8>, ptr %addr.u, align 16
  %ext.u = zext <16 x i8> %bytes.u to <16 x i32>
  %prod = mul nuw nsw <16 x i32> %ext.s, %ext.u
  %reduced = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %accum, <16 x i32> %prod)
  ; Step is 16 and the exit test compares against 16.
  %iv.next = add nuw i64 %iv, 16
  %done = icmp eq i64 %iv.next, 16
  br i1 %done, label %end, label %vector.body

end:
  ret <4 x i32> %accum
}
252+
131253define <2 x i32 > @usdot_narrow (<2 x i32 > %acc , <8 x i8 > %u , <8 x i8 > %s ) #0 {
132254; CHECK-NOI8MM-LABEL: usdot_narrow:
133255; CHECK-NOI8MM: // %bb.0:
@@ -175,13 +297,75 @@ define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{
175297; CHECK-I8MM: // %bb.0:
176298; CHECK-I8MM-NEXT: usdot v0.4s, v2.16b, v1.16b
177299; CHECK-I8MM-NEXT: ret
178- %u .wide = sext <16 x i8 > %u to <16 x i32 >
179- %s .wide = zext <16 x i8 > %s to <16 x i32 >
180- %mult = mul nuw nsw <16 x i32 > %s .wide , %u .wide
300+ %s .wide = sext <16 x i8 > %u to <16 x i32 >
301+ %u .wide = zext <16 x i8 > %s to <16 x i32 >
302+ %mult = mul nuw nsw <16 x i32 > %u .wide , %s .wide
181303 %partial.reduce = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > %acc , <16 x i32 > %mult )
182304 ret <4 x i32 > %partial.reduce
183305}
184306
; sudot_in_loop: loop-carried partial-reduce add of a mixed-sign i8 x i8
; product, with the extension kinds the other way round from usdot_in_loop.
;
; The 16 bytes loaded from %p1 are zero-extended and the 16 bytes loaded from
; %p2 are sign-extended before the <16 x i32> multiply; the product is folded
; into a <4 x i32> accumulator with
; llvm.experimental.vector.partial.reduce.add.
;
; Expected output below: the CHECK-I8MM lines want one `usdot` taking the %p1
; load (q2, the zero-extended side) as its first source; the CHECK-NOI8MM lines
; want the ushll/sshll widening followed by smlal/smull/smlal2.
; (Presumably these prefixes are the runs with and without the i8mm feature --
; the RUN lines are outside this excerpt, confirm there.)
;
; NOTE(review): %end returns the phi %accum, not %reduced. Confirm that is
; intentional before changing it; the CHECK lines would have to be regenerated.
define <4 x i32> @sudot_in_loop(ptr %p1, ptr %p2) {
; CHECK-NOI8MM-LABEL: sudot_in_loop:
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: movi v1.2d, #0000000000000000
; CHECK-NOI8MM-NEXT: mov x8, xzr
; CHECK-NOI8MM-NEXT: .LBB13_1: // %vector.body
; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NOI8MM-NEXT: ldr q0, [x0, x8]
; CHECK-NOI8MM-NEXT: ldr q2, [x1, x8]
; CHECK-NOI8MM-NEXT: add x8, x8, #16
; CHECK-NOI8MM-NEXT: cmp x8, #16
; CHECK-NOI8MM-NEXT: ushll v3.8h, v0.8b, #0
; CHECK-NOI8MM-NEXT: ushll2 v4.8h, v0.16b, #0
; CHECK-NOI8MM-NEXT: sshll v5.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b
; CHECK-NOI8MM-NEXT: smlal v1.4s, v3.4h, v5.4h
; CHECK-NOI8MM-NEXT: smull v6.4s, v4.4h, v2.4h
; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v2.8h
; CHECK-NOI8MM-NEXT: smlal2 v6.4s, v3.8h, v5.8h
; CHECK-NOI8MM-NEXT: add v1.4s, v6.4s, v1.4s
; CHECK-NOI8MM-NEXT: b.ne .LBB13_1
; CHECK-NOI8MM-NEXT: // %bb.2: // %end
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-I8MM-LABEL: sudot_in_loop:
; CHECK-I8MM: // %bb.0: // %entry
; CHECK-I8MM-NEXT: movi v1.2d, #0000000000000000
; CHECK-I8MM-NEXT: mov x8, xzr
; CHECK-I8MM-NEXT: .LBB13_1: // %vector.body
; CHECK-I8MM-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-I8MM-NEXT: ldr q2, [x0, x8]
; CHECK-I8MM-NEXT: ldr q3, [x1, x8]
; CHECK-I8MM-NEXT: mov v0.16b, v1.16b
; CHECK-I8MM-NEXT: add x8, x8, #16
; CHECK-I8MM-NEXT: usdot v1.4s, v2.16b, v3.16b
; CHECK-I8MM-NEXT: cmp x8, #16
; CHECK-I8MM-NEXT: b.ne .LBB13_1
; CHECK-I8MM-NEXT: // %bb.2: // %end
; CHECK-I8MM-NEXT: ret
entry:
  br label %vector.body

vector.body:
  ; Loop-carried state: byte offset and the running <4 x i32> accumulator.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %vector.body ]
  %accum = phi <4 x i32> [ zeroinitializer, %entry ], [ %reduced, %vector.body ]
  ; Unsigned operand: 16 bytes from %p1, zero-extended.
  %addr.u = getelementptr i8, ptr %p1, i64 %iv
  %bytes.u = load <16 x i8>, ptr %addr.u, align 16
  %ext.u = zext <16 x i8> %bytes.u to <16 x i32>
  ; Signed operand: 16 bytes from %p2, sign-extended.
  %addr.s = getelementptr i8, ptr %p2, i64 %iv
  %bytes.s = load <16 x i8>, ptr %addr.s, align 16
  %ext.s = sext <16 x i8> %bytes.s to <16 x i32>
  %prod = mul nuw nsw <16 x i32> %ext.u, %ext.s
  %reduced = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %accum, <16 x i32> %prod)
  ; Step is 16 and the exit test compares against 16.
  %iv.next = add nuw i64 %iv, 16
  %done = icmp eq i64 %iv.next, 16
  br i1 %done, label %end, label %vector.body

end:
  ret <4 x i32> %accum
}
368+
185369define <2 x i32 > @sudot_narrow (<2 x i32 > %acc , <8 x i8 > %u , <8 x i8 > %s ) #0 {
186370; CHECK-NOI8MM-LABEL: sudot_narrow:
187371; CHECK-NOI8MM: // %bb.0:
@@ -389,6 +573,54 @@ define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
389573 ret <4 x i32 > %partial.reduce
390574}
391575
; udot_no_bin_op_in_loop: loop-carried partial-reduce add whose wide operand is
; a bare zero-extension -- there is no multiply feeding the reduction.
;
; Each iteration loads 16 bytes from %p, zero-extends them to <16 x i32> and
; folds that vector into a <4 x i32> accumulator with
; llvm.experimental.vector.partial.reduce.add.
;
; Expected output below (single CHECK prefix): no dot-product instruction is
; expected. Four masks are loaded from the constant pool before the loop, the
; loaded vector is run through four `tbl` lookups with those masks, and the
; results are combined with the accumulator using plain vector adds.
;
; NOTE(review): %end returns the phi %accum, not %reduced. Confirm that is
; intentional before changing it; the CHECK lines would have to be regenerated.
define <4 x i32> @udot_no_bin_op_in_loop(ptr %p) {
; CHECK-LABEL: udot_no_bin_op_in_loop:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, .LCPI20_0
; CHECK-NEXT: movi v4.2d, #0000000000000000
; CHECK-NEXT: adrp x9, .LCPI20_2
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0]
; CHECK-NEXT: adrp x8, .LCPI20_1
; CHECK-NEXT: adrp x10, .LCPI20_3
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_1]
; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI20_2]
; CHECK-NEXT: ldr q5, [x10, :lo12:.LCPI20_3]
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB20_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr q6, [x0, x8]
; CHECK-NEXT: mov v0.16b, v4.16b
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #16
; CHECK-NEXT: tbl v7.16b, { v6.16b }, v2.16b
; CHECK-NEXT: tbl v4.16b, { v6.16b }, v1.16b
; CHECK-NEXT: tbl v16.16b, { v6.16b }, v3.16b
; CHECK-NEXT: tbl v6.16b, { v6.16b }, v5.16b
; CHECK-NEXT: add v7.4s, v0.4s, v7.4s
; CHECK-NEXT: add v6.4s, v6.4s, v16.4s
; CHECK-NEXT: add v4.4s, v4.4s, v7.4s
; CHECK-NEXT: add v4.4s, v6.4s, v4.4s
; CHECK-NEXT: b.ne .LBB20_1
; CHECK-NEXT: // %bb.2: // %end
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body:
  ; Loop-carried state: byte offset and the running <4 x i32> accumulator.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %vector.body ]
  %accum = phi <4 x i32> [ zeroinitializer, %entry ], [ %reduced, %vector.body ]
  ; Only operand: 16 bytes from %p, zero-extended and reduced directly.
  %addr = getelementptr i8, ptr %p, i64 %iv
  %bytes = load <16 x i8>, ptr %addr, align 16
  %ext = zext <16 x i8> %bytes to <16 x i32>
  %reduced = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %accum, <16 x i32> %ext)
  ; Step is 16 and the exit test compares against 16.
  %iv.next = add nuw i64 %iv, 16
  %done = icmp eq i64 %iv.next, 16
  br i1 %done, label %end, label %vector.body

end:
  ret <4 x i32> %accum
}
623+
392624define <4 x i32 > @sdot_no_bin_op (<4 x i32 > %acc , <16 x i8 > %a ){
393625; CHECK-DOT-LABEL: sdot_no_bin_op:
394626; CHECK-DOT: // %bb.0:
0 commit comments