Skip to content

Commit 7a1516a

Browse files
committed
[AArch64] match TRN starting from undef elements
When the first element of a TRN mask is undef, the isTRNMask function assumes that the value of "WhichResult" should be 1. That assumption has a 50% chance of being wrong, so we fail to match some valid trn1/trn2 shuffles. This patch introduces a more precise test to determine the correct value of "WhichResult": it checks the first non-undef element of the mask, based on the corresponding code in the isZIPMask and isUZPMask functions.
1 parent e797ec6 commit 7a1516a

File tree

4 files changed

+264
-251
lines changed

4 files changed

+264
-251
lines changed

llvm/lib/Target/AArch64/AArch64PerfectShuffle.h

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6685,15 +6685,30 @@ inline bool isUZPMask(ArrayRef<int> M, unsigned NumElts,
66856685
/// <0, 8, 2, 10, 4, 12, 6, 14> or
66866686
/// <1, 9, 3, 11, 5, 13, 7, 15>
66876687
inline bool isTRNMask(ArrayRef<int> M, unsigned NumElts,
6688-
unsigned &WhichResult) {
6688+
unsigned &WhichResultOut) {
66896689
if (NumElts % 2 != 0)
66906690
return false;
6691-
WhichResult = (M[0] == 0 ? 0 : 1);
6691+
// Check the first non-undef element for trn1 vs trn2.
6692+
unsigned WhichResult = 2;
6693+
for (unsigned i = 0; i != NumElts; i += 2) {
6694+
if (M[i] >= 0) {
6695+
WhichResult = ((unsigned)M[i] == i ? 0 : 1);
6696+
break;
6697+
}
6698+
if (M[i + 1] >= 0) {
6699+
WhichResult = ((unsigned)M[i + 1] == i + NumElts ? 0 : 1);
6700+
break;
6701+
}
6702+
}
6703+
if (WhichResult == 2)
6704+
return false;
6705+
66926706
for (unsigned i = 0; i < NumElts; i += 2) {
66936707
if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
66946708
(M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
66956709
return false;
66966710
}
6711+
WhichResultOut = WhichResult;
66976712
return true;
66986713
}
66996714

llvm/test/CodeGen/AArch64/insert-extend.ll

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -85,24 +85,24 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca
8585
; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s
8686
; CHECK-NEXT: zip1 v16.4s, v5.4s, v4.4s
8787
; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s
88+
; CHECK-NEXT: trn1 v4.4s, v5.4s, v4.4s
8889
; CHECK-NEXT: zip2 v3.4s, v6.4s, v7.4s
8990
; CHECK-NEXT: mov v6.s[1], v7.s[0]
90-
; CHECK-NEXT: ext v7.16b, v5.16b, v16.16b, #8
91-
; CHECK-NEXT: mov v5.s[3], v4.s[2]
92-
; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #8
93-
; CHECK-NEXT: mov v6.d[1], v7.d[1]
94-
; CHECK-NEXT: mov v3.d[1], v5.d[1]
95-
; CHECK-NEXT: uzp1 v1.4s, v4.4s, v0.4s
96-
; CHECK-NEXT: uzp2 v4.4s, v4.4s, v0.4s
91+
; CHECK-NEXT: ext v7.16b, v2.16b, v2.16b, #8
92+
; CHECK-NEXT: ext v5.16b, v5.16b, v16.16b, #8
93+
; CHECK-NEXT: mov v3.d[1], v4.d[1]
94+
; CHECK-NEXT: uzp1 v1.4s, v7.4s, v0.4s
95+
; CHECK-NEXT: uzp2 v4.4s, v7.4s, v0.4s
96+
; CHECK-NEXT: mov v6.d[1], v5.d[1]
9797
; CHECK-NEXT: addp v0.4s, v2.4s, v0.4s
98+
; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s
99+
; CHECK-NEXT: rev64 v7.4s, v0.4s
98100
; CHECK-NEXT: add v5.4s, v3.4s, v6.4s
99101
; CHECK-NEXT: sub v3.4s, v6.4s, v3.4s
100-
; CHECK-NEXT: rev64 v7.4s, v0.4s
101-
; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s
102+
; CHECK-NEXT: rev64 v2.4s, v1.4s
102103
; CHECK-NEXT: rev64 v4.4s, v5.4s
103104
; CHECK-NEXT: rev64 v6.4s, v3.4s
104105
; CHECK-NEXT: addp v16.4s, v0.4s, v5.4s
105-
; CHECK-NEXT: rev64 v2.4s, v1.4s
106106
; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
107107
; CHECK-NEXT: zip1 v21.4s, v16.4s, v16.4s
108108
; CHECK-NEXT: sub v4.4s, v5.4s, v4.4s

0 commit comments

Comments
 (0)