Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14815,9 +14815,10 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
}
if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
if (isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
OperandOrder == 0 ? V2 : V1);
}

if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
Expand Down Expand Up @@ -16529,7 +16530,7 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
isREVMask(M, EltSize, NumElts, 16) ||
isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
isSingletonEXTMask(M, VT, DummyUnsigned) ||
isTRNMask(M, NumElts, DummyUnsigned) ||
isTRNMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
isUZPMask(M, NumElts, DummyUnsigned) ||
isZIPMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
Expand Down Expand Up @@ -31588,10 +31589,13 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
OperandOrder == 0 ? Op1 : Op2,
OperandOrder == 0 ? Op2 : Op1));

if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
OperandOrder)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
return convertFromScalableVector(
DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
DAG, VT,
DAG.getNode(Opc, DL, ContainerVT, OperandOrder == 0 ? Op1 : Op2,
OperandOrder == 0 ? Op2 : Op1));
Copy link
Member

@MacDue MacDue Dec 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Maybe add SDValue TRN = DAG.getNode(...) since this convertFromScalableVector call has gotten a little long.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in commit 3.

}

if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
Expand Down
51 changes: 35 additions & 16 deletions llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
Original file line number Diff line number Diff line change
Expand Up @@ -6699,33 +6699,52 @@ inline bool isUZPMask(ArrayRef<int> M, unsigned NumElts,
}

/// Return true for trn1 or trn2 masks of the form:
/// <0, 8, 2, 10, 4, 12, 6, 14> or
/// <1, 9, 3, 11, 5, 13, 7, 15>
/// <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0, OperandOrderOut = 0) or
/// <1, 9, 3, 11, 5, 13, 7, 15> (WhichResultOut = 1, OperandOrderOut = 0) or
/// <8, 0, 10, 2, 12, 4, 14, 6> (WhichResultOut = 0, OperandOrderOut = 1) or
/// <9, 1, 11, 3, 13, 5, 15, 7> (WhichResultOut = 1, OperandOrderOut = 1)
inline bool isTRNMask(ArrayRef<int> M, unsigned NumElts,
unsigned &WhichResultOut) {
unsigned &WhichResultOut, unsigned &OperandOrderOut) {
if (NumElts % 2 != 0)
return false;
// Check the first non-undef element for trn1 vs trn2.
unsigned WhichResult = 2;

// "Variant" refers to the distinction between trn1 and trn2, while
// "Order" refers to the sequence of input registers (matching vs flipped).
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: This comment needs updating. I'd maybe phrase it like:

"Result" corresponds to "WhichResultOut", which selects between trn1 and trn2.
"Order" corresponds to "OperandOrderOut", which selects the order of operands for the instruction (flipped or not).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have updated the comment in commit 5.

bool Variant0Order0 = true; // WhichResultOut = 0, OperandOrderOut = 0
bool Variant1Order0 = true; // WhichResultOut = 1, OperandOrderOut = 0
bool Variant0Order1 = true; // WhichResultOut = 0, OperandOrderOut = 1
bool Variant1Order1 = true; // WhichResultOut = 1, OperandOrderOut = 1
Copy link
Member

@MacDue MacDue Dec 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Maybe replace Variant with Result (since that more closely maps to WhichResultOut)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hesitated, because I think Variant describes the purpose better than Result. I considered some alternatives:

  • WhichResult0Order0 etc is too long and makes the code harder to read due to different formatting decisions;
  • instead renaming the function parameter VariantOut would warrant updating the call sites, which then leads to also renaming the parameter in isZIPMask and isUZPMask, which doesn't belong in this PR;

In the end, I couldn't come up with something better, so I followed your suggestion in commit 4.

// Check all elements match.
for (unsigned i = 0; i != NumElts; i += 2) {
if (M[i] >= 0) {
WhichResult = ((unsigned)M[i] == i ? 0 : 1);
break;
unsigned EvenElt = (unsigned)M[i];
if (EvenElt != i)
Variant0Order0 = false;
if (EvenElt != i + 1)
Variant1Order0 = false;
if (EvenElt != NumElts + i)
Variant0Order1 = false;
if (EvenElt != NumElts + i + 1)
Variant1Order1 = false;
}
if (M[i + 1] >= 0) {
WhichResult = ((unsigned)M[i + 1] == i + NumElts ? 0 : 1);
break;
unsigned OddElt = (unsigned)M[i + 1];
if (OddElt != NumElts + i)
Variant0Order0 = false;
if (OddElt != NumElts + i + 1)
Variant1Order0 = false;
if (OddElt != i)
Variant0Order1 = false;
if (OddElt != i + 1)
Variant1Order1 = false;
}
}
if (WhichResult == 2)

if (Variant0Order0 + Variant1Order0 + Variant0Order1 + Variant1Order1 != 1)
return false;

for (unsigned i = 0; i < NumElts; i += 2) {
if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
(M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
return false;
}
WhichResultOut = WhichResult;
WhichResultOut = (Variant0Order0 || Variant0Order1) ? 0 : 1;
OperandOrderOut = (Variant0Order0 || Variant1Order0) ? 0 : 1;
return true;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,14 +215,15 @@ bool matchTRN(MachineInstr &MI, MachineRegisterInfo &MRI,
ShuffleVectorPseudo &MatchInfo) {
assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
unsigned WhichResult;
unsigned OperandOrder;
ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
Register Dst = MI.getOperand(0).getReg();
unsigned NumElts = MRI.getType(Dst).getNumElements();
if (!isTRNMask(ShuffleMask, NumElts, WhichResult))
if (!isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder))
return false;
unsigned Opc = (WhichResult == 0) ? AArch64::G_TRN1 : AArch64::G_TRN2;
Register V1 = MI.getOperand(1).getReg();
Register V2 = MI.getOperand(2).getReg();
Register V1 = MI.getOperand(OperandOrder == 0 ? 1 : 2).getReg();
Register V2 = MI.getOperand(OperandOrder == 0 ? 2 : 1).getReg();
MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
return true;
}
Expand Down
81 changes: 81 additions & 0 deletions llvm/test/CodeGen/AArch64/arm64-trn.ll
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,87 @@ define <4 x float> @vtrnQf(ptr %A, ptr %B) nounwind {
ret <4 x float> %tmp5
}

define <8 x i8> @vtrni8_8first(ptr %A, ptr %B) nounwind {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be nice if the test names somewhat follow the possible cases (or at least some comments for them). E.g. vtrn2_flipped, vtrn1, vtrn1_flipped.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was trying to follow the naming conventions from existing tests, but gladly renamed in commit 2.

; CHECKLE-LABEL: vtrni8_8first:
; CHECKLE: // %bb.0:
; CHECKLE-NEXT: ldr d0, [x0]
; CHECKLE-NEXT: ldr d1, [x1]
; CHECKLE-NEXT: trn1 v2.8b, v1.8b, v0.8b
; CHECKLE-NEXT: trn2 v0.8b, v0.8b, v1.8b
; CHECKLE-NEXT: add v0.8b, v2.8b, v0.8b
; CHECKLE-NEXT: ret
;
; CHECKBE-LABEL: vtrni8_8first:
; CHECKBE: // %bb.0:
; CHECKBE-NEXT: ld1 { v0.8b }, [x0]
; CHECKBE-NEXT: ld1 { v1.8b }, [x1]
; CHECKBE-NEXT: trn1 v2.8b, v1.8b, v0.8b
; CHECKBE-NEXT: trn2 v0.8b, v0.8b, v1.8b
; CHECKBE-NEXT: add v0.8b, v2.8b, v0.8b
; CHECKBE-NEXT: rev64 v0.8b, v0.8b
; CHECKBE-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do all these tests need to load, or can they take input args?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was trying to match surrounding test cases as closely as possible, all of which seem to have the load instructions and slightly broken formatting. I gladly simplified and reformatted in commit 2.

%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
%tmp5 = add <8 x i8> %tmp3, %tmp4
ret <8 x i8> %tmp5
}

define <8 x i8> @vtrni8_9first(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrni8_9first:
; CHECKLE: // %bb.0:
; CHECKLE-NEXT: ldr d0, [x0]
; CHECKLE-NEXT: ldr d1, [x1]
; CHECKLE-NEXT: trn1 v2.8b, v1.8b, v0.8b
; CHECKLE-NEXT: trn2 v0.8b, v1.8b, v0.8b
; CHECKLE-NEXT: add v0.8b, v2.8b, v0.8b
; CHECKLE-NEXT: ret
;
; CHECKBE-LABEL: vtrni8_9first:
; CHECKBE: // %bb.0:
; CHECKBE-NEXT: ld1 { v0.8b }, [x0]
; CHECKBE-NEXT: ld1 { v1.8b }, [x1]
; CHECKBE-NEXT: trn1 v2.8b, v1.8b, v0.8b
; CHECKBE-NEXT: trn2 v0.8b, v1.8b, v0.8b
; CHECKBE-NEXT: add v0.8b, v2.8b, v0.8b
; CHECKBE-NEXT: rev64 v0.8b, v0.8b
; CHECKBE-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 9, i32 1, i32 11, i32 3, i32 13, i32 5, i32 15, i32 7>
%tmp5 = add <8 x i8> %tmp3, %tmp4
ret <8 x i8> %tmp5
}

define <8 x i8> @vtrni8_89first_undef(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrni8_89first_undef:
; CHECKLE: // %bb.0:
; CHECKLE-NEXT: ldr d0, [x0]
; CHECKLE-NEXT: ldr d1, [x1]
; CHECKLE-NEXT: trn1 v2.8b, v1.8b, v0.8b
; CHECKLE-NEXT: trn2 v0.8b, v1.8b, v0.8b
; CHECKLE-NEXT: add v0.8b, v2.8b, v0.8b
; CHECKLE-NEXT: ret
;
; CHECKBE-LABEL: vtrni8_89first_undef:
; CHECKBE: // %bb.0:
; CHECKBE-NEXT: ld1 { v0.8b }, [x0]
; CHECKBE-NEXT: ld1 { v1.8b }, [x1]
; CHECKBE-NEXT: trn1 v2.8b, v1.8b, v0.8b
; CHECKBE-NEXT: trn2 v0.8b, v1.8b, v0.8b
; CHECKBE-NEXT: add v0.8b, v2.8b, v0.8b
; CHECKBE-NEXT: rev64 v0.8b, v0.8b
; CHECKBE-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 8, i32 0, i32 poison, i32 2, i32 poison, i32 4, i32 14, i32 6>
%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 9, i32 1, i32 poison, i32 3, i32 13, i32 5, i32 15, i32 poison>
%tmp5 = add <8 x i8> %tmp3, %tmp4
ret <8 x i8> %tmp5
}

; Undef shuffle indices (even at the start of the shuffle mask) should not prevent matching to VTRN:

define <8 x i8> @vtrni8_undef(ptr %A, ptr %B) nounwind {
Expand Down
8 changes: 3 additions & 5 deletions llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,10 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
; CHECK-SD-LABEL: vector_deinterleave_v2f16_v4f16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: dup v2.2s, v0.s[1]
; CHECK-SD-NEXT: mov v1.16b, v2.16b
; CHECK-SD-NEXT: zip1 v2.4h, v0.4h, v2.4h
; CHECK-SD-NEXT: mov v1.h[0], v0.h[1]
; CHECK-SD-NEXT: dup v1.2s, v0.s[1]
; CHECK-SD-NEXT: zip1 v2.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: trn2 v1.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: fmov d0, d2
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: vector_deinterleave_v2f16_v4f16:
Expand Down
Loading