Skip to content

Commit 031fb74

Browse files
authored
[AArch64][SME] Preserve Chain when selecting multi-vector LUTI4s (#161494)
Previously, the `Chain` operand was dropped during selection, meaning LUTI4 nodes that differed only in their chain operand would be incorrectly CSE'd. Fixes: #161420
1 parent 6e52e53 commit 031fb74

File tree

4 files changed

+23
-12
lines changed

4 files changed

+23
-12
lines changed

llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2089,7 +2089,8 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
20892089
if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
20902090
return;
20912091

2092-
SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)};
2092+
SDValue Chain = Node->getOperand(0);
2093+
SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4), Chain};
20932094
SDLoc DL(Node);
20942095
EVT VT = Node->getValueType(0);
20952096

@@ -2110,14 +2111,15 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
21102111
void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
21112112
unsigned NumOutVecs,
21122113
unsigned Opc) {
2113-
21142114
SDValue ZtValue;
2115-
SmallVector<SDValue, 4> Ops;
21162115
if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
21172116
return;
21182117

2119-
Ops.push_back(ZtValue);
2120-
Ops.push_back(createZMulTuple({Node->getOperand(3), Node->getOperand(4)}));
2118+
SDValue Chain = Node->getOperand(0);
2119+
SDValue Ops[] = {ZtValue,
2120+
createZMulTuple({Node->getOperand(3), Node->getOperand(4)}),
2121+
Chain};
2122+
21212123
SDLoc DL(Node);
21222124
EVT VT = Node->getValueType(0);
21232125

llvm/test/CodeGen/AArch64/pr161420.ll

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,20 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"
55
target triple = "arm64-apple-macosx15.0.0"
66

77
; From: https://github.com/llvm/llvm-project/issues/161420. This test checks that
8-
; two `luti4` instructions are emitted. FIXME: This is currently broken!
8+
; two `luti4` instructions are emitted.
99
define void @pluto(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3) #0 {
1010
; CHECK-LABEL: pluto:
1111
; CHECK: ; %bb.0: ; %bb
1212
; CHECK-NEXT: mov w8, #0 ; =0x0
1313
; CHECK-NEXT: ldr zt0, [x1]
14-
; CHECK-NEXT: ldr z0, [x3]
14+
; CHECK-NEXT: ldr z4, [x3]
1515
; CHECK-NEXT: ptrue pn8.h
16-
; CHECK-NEXT: ld1h { z4.h - z7.h }, pn8/z, [x0]
17-
; CHECK-NEXT: luti4 { z0.h - z3.h }, zt0, z0[0]
18-
; CHECK-NEXT: fmla za.h[w8, 2, vgx4], { z4.h - z7.h }, { z0.h - z3.h }
16+
; CHECK-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0]
17+
; CHECK-NEXT: luti4 { z16.h - z19.h }, zt0, z4[0]
18+
; CHECK-NEXT: fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, { z16.h - z19.h }
19+
; CHECK-NEXT: ldr zt0, [x2]
20+
; CHECK-NEXT: luti4 { z4.h - z7.h }, zt0, z4[0]
21+
; CHECK-NEXT: fmla za.h[w8, 2, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
1922
; CHECK-NEXT: ret
2023
bb:
2124
tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %arg1)

llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,13 @@ define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscal
4949
}
5050

5151
; Tests that multiple identical luti4 intrinsics, with ZT0 loads interspersed, are not CSE'd.
52-
; FIXME: This is currently broken!
5352
define void @test_multiple_luti4_zt_i8(ptr %ptrA, ptr %ptrB, <vscale x 16 x i8> %x) {
5453
; CHECK-LABEL: test_multiple_luti4_zt_i8:
5554
; CHECK: // %bb.0:
55+
; CHECK-NEXT: ldr zt0, [x0]
56+
; CHECK-NEXT: luti4 { z4.s - z7.s }, zt0, z0[1]
57+
; CHECK-NEXT: // fake_use: $z4 $z4_z5_z6_z7
58+
; CHECK-NEXT: ldr zt0, [x1]
5659
; CHECK-NEXT: luti4 { z0.s - z3.s }, zt0, z0[1]
5760
; CHECK-NEXT: // fake_use: $z0 $z0_z1_z2_z3
5861
; CHECK-NEXT: ret

llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,15 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16
1515
}
1616

1717
; Tests that multiple identical luti4 intrinsics, with ZT0 loads interspersed, are not CSE'd.
18-
; FIXME: This is currently broken!
1918
define void @test_multiple_luti4_zt_i8(ptr %ptrA, ptr %ptrB, <vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1) #0 {
2019
; CHECK-LABEL: test_multiple_luti4_zt_i8:
2120
; CHECK: // %bb.0:
21+
; CHECK-NEXT: ldr zt0, [x0]
2222
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
2323
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
24+
; CHECK-NEXT: luti4 { z4.b - z7.b }, zt0, { z0, z1 }
25+
; CHECK-NEXT: // fake_use: $z4 $z4_z5_z6_z7
26+
; CHECK-NEXT: ldr zt0, [x1]
2427
; CHECK-NEXT: luti4 { z0.b - z3.b }, zt0, { z0, z1 }
2528
; CHECK-NEXT: // fake_use: $z0 $z0_z1_z2_z3
2629
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)