Skip to content

Commit bdcaa00

Browse files
authored
[AArch64] match TRN starting from undef elements (#167955)
When the first element of a TRN shuffle mask is undef, the `isTRNMask` function assumes `WhichResult = 1`. That assumption has a 50% chance of being wrong, so we fail to match some valid trn1/trn2 masks. This patch introduces a more precise test that determines the correct value of `WhichResult` from the first non-undef element of the mask, based on the corresponding code in the `isZIPMask` and `isUZPMask` functions. This change is based on #89578. I'd like to follow it up with a further change along the lines of #167235.
1 parent c32c1d0 commit bdcaa00

File tree

5 files changed

+377
-252
lines changed

5 files changed

+377
-252
lines changed

llvm/lib/Target/AArch64/AArch64PerfectShuffle.h

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6685,15 +6685,30 @@ inline bool isUZPMask(ArrayRef<int> M, unsigned NumElts,
66856685
/// <0, 8, 2, 10, 4, 12, 6, 14> or
66866686
/// <1, 9, 3, 11, 5, 13, 7, 15>
66876687
inline bool isTRNMask(ArrayRef<int> M, unsigned NumElts,
6688-
unsigned &WhichResult) {
6688+
unsigned &WhichResultOut) {
66896689
if (NumElts % 2 != 0)
66906690
return false;
6691-
WhichResult = (M[0] == 0 ? 0 : 1);
6691+
// Check the first non-undef element for trn1 vs trn2.
6692+
unsigned WhichResult = 2;
6693+
for (unsigned i = 0; i != NumElts; i += 2) {
6694+
if (M[i] >= 0) {
6695+
WhichResult = ((unsigned)M[i] == i ? 0 : 1);
6696+
break;
6697+
}
6698+
if (M[i + 1] >= 0) {
6699+
WhichResult = ((unsigned)M[i + 1] == i + NumElts ? 0 : 1);
6700+
break;
6701+
}
6702+
}
6703+
if (WhichResult == 2)
6704+
return false;
6705+
66926706
for (unsigned i = 0; i < NumElts; i += 2) {
66936707
if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
66946708
(M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
66956709
return false;
66966710
}
6711+
WhichResultOut = WhichResult;
66976712
return true;
66986713
}
66996714

llvm/test/CodeGen/AArch64/arm64-trn.ll

Lines changed: 113 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ define <4 x float> @vtrnQf(ptr %A, ptr %B) nounwind {
246246
ret <4 x float> %tmp5
247247
}
248248

249-
; Undef shuffle indices should not prevent matching to VTRN:
249+
; Undef shuffle indices (even at the start of the shuffle mask) should not prevent matching to VTRN:
250250

251251
define <8 x i8> @vtrni8_undef(ptr %A, ptr %B) nounwind {
252252
; CHECKLE-LABEL: vtrni8_undef:
@@ -302,3 +302,115 @@ define <8 x i16> @vtrnQi16_undef(ptr %A, ptr %B) nounwind {
302302
%tmp5 = add <8 x i16> %tmp3, %tmp4
303303
ret <8 x i16> %tmp5
304304
}
305+
306+
define <8 x i16> @vtrnQi16_undef_01(ptr %A, ptr %B) nounwind {
307+
; CHECKLE-LABEL: vtrnQi16_undef_01:
308+
; CHECKLE: // %bb.0:
309+
; CHECKLE-NEXT: ldr q0, [x0]
310+
; CHECKLE-NEXT: ldr q1, [x1]
311+
; CHECKLE-NEXT: trn1 v2.8h, v0.8h, v1.8h
312+
; CHECKLE-NEXT: trn2 v0.8h, v0.8h, v1.8h
313+
; CHECKLE-NEXT: add v0.8h, v2.8h, v0.8h
314+
; CHECKLE-NEXT: ret
315+
;
316+
; CHECKBE-LABEL: vtrnQi16_undef_01:
317+
; CHECKBE: // %bb.0:
318+
; CHECKBE-NEXT: ld1 { v0.8h }, [x0]
319+
; CHECKBE-NEXT: ld1 { v1.8h }, [x1]
320+
; CHECKBE-NEXT: trn1 v2.8h, v0.8h, v1.8h
321+
; CHECKBE-NEXT: trn2 v0.8h, v0.8h, v1.8h
322+
; CHECKBE-NEXT: add v0.8h, v2.8h, v0.8h
323+
; CHECKBE-NEXT: rev64 v0.8h, v0.8h
324+
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
325+
; CHECKBE-NEXT: ret
326+
%tmp1 = load <8 x i16>, ptr %A
327+
%tmp2 = load <8 x i16>, ptr %B
328+
%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 poison, i32 poison, i32 2, i32 poison, i32 4, i32 12, i32 6, i32 14>
329+
%tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 poison, i32 poison, i32 3, i32 11, i32 poison, i32 13, i32 poison, i32 poison>
330+
%tmp5 = add <8 x i16> %tmp3, %tmp4
331+
ret <8 x i16> %tmp5
332+
}
333+
334+
define <8 x i16> @vtrnQi16_undef_0(ptr %A, ptr %B) nounwind {
335+
; CHECKLE-LABEL: vtrnQi16_undef_0:
336+
; CHECKLE: // %bb.0:
337+
; CHECKLE-NEXT: ldr q0, [x0]
338+
; CHECKLE-NEXT: ldr q1, [x1]
339+
; CHECKLE-NEXT: trn1 v2.8h, v0.8h, v1.8h
340+
; CHECKLE-NEXT: trn2 v0.8h, v0.8h, v1.8h
341+
; CHECKLE-NEXT: add v0.8h, v2.8h, v0.8h
342+
; CHECKLE-NEXT: ret
343+
;
344+
; CHECKBE-LABEL: vtrnQi16_undef_0:
345+
; CHECKBE: // %bb.0:
346+
; CHECKBE-NEXT: ld1 { v0.8h }, [x0]
347+
; CHECKBE-NEXT: ld1 { v1.8h }, [x1]
348+
; CHECKBE-NEXT: trn1 v2.8h, v0.8h, v1.8h
349+
; CHECKBE-NEXT: trn2 v0.8h, v0.8h, v1.8h
350+
; CHECKBE-NEXT: add v0.8h, v2.8h, v0.8h
351+
; CHECKBE-NEXT: rev64 v0.8h, v0.8h
352+
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
353+
; CHECKBE-NEXT: ret
354+
%tmp1 = load <8 x i16>, ptr %A
355+
%tmp2 = load <8 x i16>, ptr %B
356+
%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 poison, i32 8, i32 poison, i32 poison, i32 4, i32 12, i32 6, i32 14>
357+
%tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 poison, i32 9, i32 3, i32 11, i32 5, i32 13, i32 poison, i32 poison>
358+
%tmp5 = add <8 x i16> %tmp3, %tmp4
359+
ret <8 x i16> %tmp5
360+
}
361+
362+
define <4 x i32> @vtrnQi32_undef_1(ptr %A, ptr %B) nounwind {
363+
; CHECKLE-LABEL: vtrnQi32_undef_1:
364+
; CHECKLE: // %bb.0:
365+
; CHECKLE-NEXT: ldr q0, [x0]
366+
; CHECKLE-NEXT: ldr q1, [x1]
367+
; CHECKLE-NEXT: trn1 v2.4s, v0.4s, v1.4s
368+
; CHECKLE-NEXT: trn2 v0.4s, v0.4s, v1.4s
369+
; CHECKLE-NEXT: add v0.4s, v2.4s, v0.4s
370+
; CHECKLE-NEXT: ret
371+
;
372+
; CHECKBE-LABEL: vtrnQi32_undef_1:
373+
; CHECKBE: // %bb.0:
374+
; CHECKBE-NEXT: ld1 { v0.4s }, [x0]
375+
; CHECKBE-NEXT: ld1 { v1.4s }, [x1]
376+
; CHECKBE-NEXT: trn1 v2.4s, v0.4s, v1.4s
377+
; CHECKBE-NEXT: trn2 v0.4s, v0.4s, v1.4s
378+
; CHECKBE-NEXT: add v0.4s, v2.4s, v0.4s
379+
; CHECKBE-NEXT: rev64 v0.4s, v0.4s
380+
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
381+
; CHECKBE-NEXT: ret
382+
%tmp1 = load <4 x i32>, ptr %A
383+
%tmp2 = load <4 x i32>, ptr %B
384+
%tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 poison, i32 2, i32 6>
385+
%tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 poison, i32 3, i32 7>
386+
%tmp5 = add <4 x i32> %tmp3, %tmp4
387+
ret <4 x i32> %tmp5
388+
}
389+
390+
define <16 x i8> @vtrnQi8_undef_012(ptr %A, ptr %B) nounwind {
391+
; CHECKLE-LABEL: vtrnQi8_undef_012:
392+
; CHECKLE: // %bb.0:
393+
; CHECKLE-NEXT: ldr q0, [x0]
394+
; CHECKLE-NEXT: ldr q1, [x1]
395+
; CHECKLE-NEXT: trn1 v2.16b, v0.16b, v1.16b
396+
; CHECKLE-NEXT: trn2 v0.16b, v0.16b, v1.16b
397+
; CHECKLE-NEXT: add v0.16b, v2.16b, v0.16b
398+
; CHECKLE-NEXT: ret
399+
;
400+
; CHECKBE-LABEL: vtrnQi8_undef_012:
401+
; CHECKBE: // %bb.0:
402+
; CHECKBE-NEXT: ld1 { v0.16b }, [x0]
403+
; CHECKBE-NEXT: ld1 { v1.16b }, [x1]
404+
; CHECKBE-NEXT: trn1 v2.16b, v0.16b, v1.16b
405+
; CHECKBE-NEXT: trn2 v0.16b, v0.16b, v1.16b
406+
; CHECKBE-NEXT: add v0.16b, v2.16b, v0.16b
407+
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
408+
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
409+
; CHECKBE-NEXT: ret
410+
%tmp1 = load <16 x i8>, ptr %A
411+
%tmp2 = load <16 x i8>, ptr %B
412+
%tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 18, i32 4, i32 poison, i32 6, i32 22, i32 poison, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
413+
%tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 19, i32 5, i32 21, i32 7, i32 poison, i32 9, i32 25, i32 11, i32 27, i32 poison, i32 29, i32 15, i32 31>
414+
%tmp5 = add <16 x i8> %tmp3, %tmp4
415+
ret <16 x i8> %tmp5
416+
}

llvm/test/CodeGen/AArch64/insert-extend.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -85,24 +85,24 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca
8585
; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s
8686
; CHECK-NEXT: zip1 v16.4s, v5.4s, v4.4s
8787
; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s
88+
; CHECK-NEXT: trn1 v4.4s, v5.4s, v4.4s
8889
; CHECK-NEXT: zip2 v3.4s, v6.4s, v7.4s
8990
; CHECK-NEXT: mov v6.s[1], v7.s[0]
90-
; CHECK-NEXT: ext v7.16b, v5.16b, v16.16b, #8
91-
; CHECK-NEXT: mov v5.s[3], v4.s[2]
92-
; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #8
93-
; CHECK-NEXT: mov v6.d[1], v7.d[1]
94-
; CHECK-NEXT: mov v3.d[1], v5.d[1]
95-
; CHECK-NEXT: uzp1 v1.4s, v4.4s, v0.4s
96-
; CHECK-NEXT: uzp2 v4.4s, v4.4s, v0.4s
91+
; CHECK-NEXT: ext v7.16b, v2.16b, v2.16b, #8
92+
; CHECK-NEXT: ext v5.16b, v5.16b, v16.16b, #8
93+
; CHECK-NEXT: mov v3.d[1], v4.d[1]
94+
; CHECK-NEXT: uzp1 v1.4s, v7.4s, v0.4s
95+
; CHECK-NEXT: uzp2 v4.4s, v7.4s, v0.4s
96+
; CHECK-NEXT: mov v6.d[1], v5.d[1]
9797
; CHECK-NEXT: addp v0.4s, v2.4s, v0.4s
98+
; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s
99+
; CHECK-NEXT: rev64 v7.4s, v0.4s
98100
; CHECK-NEXT: add v5.4s, v3.4s, v6.4s
99101
; CHECK-NEXT: sub v3.4s, v6.4s, v3.4s
100-
; CHECK-NEXT: rev64 v7.4s, v0.4s
101-
; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s
102+
; CHECK-NEXT: rev64 v2.4s, v1.4s
102103
; CHECK-NEXT: rev64 v4.4s, v5.4s
103104
; CHECK-NEXT: rev64 v6.4s, v3.4s
104105
; CHECK-NEXT: addp v16.4s, v0.4s, v5.4s
105-
; CHECK-NEXT: rev64 v2.4s, v1.4s
106106
; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
107107
; CHECK-NEXT: zip1 v21.4s, v16.4s, v16.4s
108108
; CHECK-NEXT: sub v4.4s, v5.4s, v4.4s

0 commit comments

Comments
 (0)