Skip to content

Commit b481512

Browse files
[SVE] Move reg+reg gather/scatter addressing optimisations from lowering into DAG combine.
This is essentially a refactoring patch, but it allows more cases to be caught — hence the output changes to some tests. Differential Revision: https://reviews.llvm.org/D122994
1 parent f927be0 commit b481512

File tree

4 files changed

+56
-113
lines changed

4 files changed

+56
-113
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 25 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -4651,63 +4651,29 @@ bool getGatherScatterIndexIsExtended(SDValue Index) {
46514651
return false;
46524652
}
46534653

4654-
// If the base pointer of a masked gather or scatter is null, we
4655-
// may be able to swap BasePtr & Index and use the vector + register
4656-
// or vector + immediate addressing mode, e.g.
4657-
// VECTOR + REGISTER:
4658-
// getelementptr nullptr, <vscale x N x T> (splat(%offset)) + %indices)
4659-
// -> getelementptr %offset, <vscale x N x T> %indices
4654+
// If the base pointer of a masked gather or scatter is constant, we
4655+
// may be able to swap BasePtr & Index and use the vector + immediate addressing
4656+
// mode, e.g.
46604657
// VECTOR + IMMEDIATE:
46614658
// getelementptr nullptr, <vscale x N x T> (splat(#x)) + %indices)
46624659
// -> getelementptr #x, <vscale x N x T> %indices
46634660
void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index,
46644661
bool IsScaled, EVT MemVT, unsigned &Opcode,
46654662
bool IsGather, SelectionDAG &DAG) {
4666-
if (!isNullConstant(BasePtr) || IsScaled)
4663+
ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(BasePtr);
4664+
if (!Offset || IsScaled)
46674665
return;
46684666

4669-
// FIXME: This will not match for fixed vector type codegen as the nodes in
4670-
// question will have fixed<->scalable conversions around them. This should be
4671-
// moved to a DAG combine or complex pattern so that it executes after all of
4672-
// the fixed vector insert and extracts have been removed. This deficiency
4673-
// will result in a sub-optimal addressing mode being used, i.e. an ADD not
4674-
// being folded into the scatter/gather.
4675-
ConstantSDNode *Offset = nullptr;
4676-
if (Index.getOpcode() == ISD::ADD)
4677-
if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) {
4678-
if (isa<ConstantSDNode>(SplatVal))
4679-
Offset = cast<ConstantSDNode>(SplatVal);
4680-
else {
4681-
BasePtr = SplatVal;
4682-
Index = Index->getOperand(0);
4683-
return;
4684-
}
4685-
}
4686-
4687-
unsigned NewOp =
4688-
IsGather ? AArch64ISD::GLD1_IMM_MERGE_ZERO : AArch64ISD::SST1_IMM_PRED;
4689-
4690-
if (!Offset) {
4691-
std::swap(BasePtr, Index);
4692-
Opcode = NewOp;
4693-
return;
4694-
}
4695-
46964667
uint64_t OffsetVal = Offset->getZExtValue();
46974668
unsigned ScalarSizeInBytes = MemVT.getScalarSizeInBits() / 8;
4698-
auto ConstOffset = DAG.getConstant(OffsetVal, SDLoc(Index), MVT::i64);
46994669

4700-
if (OffsetVal % ScalarSizeInBytes || OffsetVal / ScalarSizeInBytes > 31) {
4701-
// Index is out of range for the immediate addressing mode
4702-
BasePtr = ConstOffset;
4703-
Index = Index->getOperand(0);
4670+
if (OffsetVal % ScalarSizeInBytes || OffsetVal / ScalarSizeInBytes > 31)
47044671
return;
4705-
}
47064672

47074673
// Immediate is in range
4708-
Opcode = NewOp;
4709-
BasePtr = Index->getOperand(0);
4710-
Index = ConstOffset;
4674+
Opcode =
4675+
IsGather ? AArch64ISD::GLD1_IMM_MERGE_ZERO : AArch64ISD::SST1_IMM_PRED;
4676+
std::swap(BasePtr, Index);
47114677
}
47124678

47134679
SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
@@ -17136,43 +17102,43 @@ static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
1713617102
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
1713717103
SDValue &BasePtr, SDValue &Index,
1713817104
SelectionDAG &DAG) {
17105+
// Try to iteratively fold parts of the index into the base pointer to
17106+
// simplify the index as much as possible.
17107+
bool Changed = false;
17108+
while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
17109+
Changed = true;
17110+
1713917111
// Only consider element types that are pointer sized as smaller types can
1714017112
// be easily promoted.
1714117113
EVT IndexVT = Index.getValueType();
1714217114
if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
17143-
return false;
17144-
17145-
// Try to iteratively fold parts of the index into the base pointer to
17146-
// simplify the index as much as possible.
17147-
SDValue NewBasePtr = BasePtr, NewIndex = Index;
17148-
while (foldIndexIntoBase(NewBasePtr, NewIndex, N->getScale(), SDLoc(N), DAG))
17149-
;
17115+
return Changed;
1715017116

1715117117
// Match:
1715217118
// Index = step(const)
1715317119
int64_t Stride = 0;
17154-
if (NewIndex.getOpcode() == ISD::STEP_VECTOR)
17155-
Stride = cast<ConstantSDNode>(NewIndex.getOperand(0))->getSExtValue();
17120+
if (Index.getOpcode() == ISD::STEP_VECTOR)
17121+
Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
1715617122

1715717123
// Match:
1715817124
// Index = step(const) << shift(const)
17159-
else if (NewIndex.getOpcode() == ISD::SHL &&
17160-
NewIndex.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
17161-
SDValue RHS = NewIndex.getOperand(1);
17125+
else if (Index.getOpcode() == ISD::SHL &&
17126+
Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
17127+
SDValue RHS = Index.getOperand(1);
1716217128
if (auto *Shift =
1716317129
dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
17164-
int64_t Step = (int64_t)NewIndex.getOperand(0).getConstantOperandVal(1);
17130+
int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
1716517131
Stride = Step << Shift->getZExtValue();
1716617132
}
1716717133
}
1716817134

1716917135
// Return early because no supported pattern is found.
1717017136
if (Stride == 0)
17171-
return false;
17137+
return Changed;
1717217138

1717317139
if (Stride < std::numeric_limits<int32_t>::min() ||
1717417140
Stride > std::numeric_limits<int32_t>::max())
17175-
return false;
17141+
return Changed;
1717617142

1717717143
const auto &Subtarget =
1717817144
static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
@@ -17183,14 +17149,13 @@ static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
1718317149

1718417150
if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
1718517151
LastElementOffset > std::numeric_limits<int32_t>::max())
17186-
return false;
17152+
return Changed;
1718717153

1718817154
EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
1718917155
// Stride does not scale explicitly by 'Scale', because it happens in
1719017156
// the gather/scatter addressing mode.
1719117157
Index = DAG.getNode(ISD::STEP_VECTOR, SDLoc(N), NewIndexVT,
1719217158
DAG.getTargetConstant(Stride, SDLoc(N), MVT::i32));
17193-
BasePtr = NewBasePtr;
1719417159
return true;
1719517160
}
1719617161

llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1155,19 +1155,16 @@ define void @masked_gather_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %b
11551155
ret void
11561156
}
11571157

1158-
; FIXME: This case does not yet codegen well due to deficiencies in opcode selection
11591158
define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %off) #0 {
11601159
; VBITS_GE_2048-LABEL: masked_gather_vec_plus_reg:
11611160
; VBITS_GE_2048: // %bb.0:
11621161
; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
11631162
; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
11641163
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
11651164
; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
1166-
; VBITS_GE_2048-NEXT: mov z2.d, x2
11671165
; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
1168-
; VBITS_GE_2048-NEXT: add z0.d, z1.d, z2.d
11691166
; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b
1170-
; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z0.d]
1167+
; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d]
11711168
; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
11721169
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
11731170
; VBITS_GE_2048-NEXT: ret
@@ -1181,7 +1178,6 @@ define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %o
11811178
ret void
11821179
}
11831180

1184-
; FIXME: This case does not yet codegen well due to deficiencies in opcode selection
11851181
define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 {
11861182
; VBITS_GE_2048-LABEL: masked_gather_vec_plus_imm:
11871183
; VBITS_GE_2048: // %bb.0:
@@ -1190,9 +1186,8 @@ define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 {
11901186
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
11911187
; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
11921188
; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
1193-
; VBITS_GE_2048-NEXT: add z1.d, z1.d, #4
11941189
; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b
1195-
; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d]
1190+
; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d, #4]
11961191
; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
11971192
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
11981193
; VBITS_GE_2048-NEXT: ret

llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1051,20 +1051,17 @@ define void @masked_scatter_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %
10511051
ret void
10521052
}
10531053

1054-
; FIXME: This case does not yet codegen well due to deficiencies in opcode selection
10551054
define void @masked_scatter_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %off) #0 {
10561055
; VBITS_GE_2048-LABEL: masked_scatter_vec_plus_reg:
10571056
; VBITS_GE_2048: // %bb.0:
10581057
; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
10591058
; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
10601059
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
10611060
; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
1062-
; VBITS_GE_2048-NEXT: mov z2.d, x2
10631061
; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
1064-
; VBITS_GE_2048-NEXT: add z1.d, z1.d, z2.d
10651062
; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
10661063
; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
1067-
; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d]
1064+
; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x2, z1.d]
10681065
; VBITS_GE_2048-NEXT: ret
10691066
%vals = load <32 x float>, <32 x float>* %a
10701067
%bases = load <32 x i8*>, <32 x i8*>* %b
@@ -1075,7 +1072,6 @@ define void @masked_scatter_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %
10751072
ret void
10761073
}
10771074

1078-
; FIXME: This case does not yet codegen well due to deficiencies in opcode selection
10791075
define void @masked_scatter_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 {
10801076
; VBITS_GE_2048-LABEL: masked_scatter_vec_plus_imm:
10811077
; VBITS_GE_2048: // %bb.0:
@@ -1084,10 +1080,9 @@ define void @masked_scatter_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 {
10841080
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
10851081
; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
10861082
; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
1087-
; VBITS_GE_2048-NEXT: add z1.d, z1.d, #4
10881083
; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
10891084
; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
1090-
; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d]
1085+
; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d, #4]
10911086
; VBITS_GE_2048-NEXT: ret
10921087
%vals = load <32 x float>, <32 x float>* %a
10931088
%bases = load <32 x i8*>, <32 x i8*>* %b

llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll

Lines changed: 27 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -105,20 +105,18 @@ define void @scatter_i8_index_offset_maximum_plus_one(i8* %base, i64 %offset, <v
105105
; CHECK-NEXT: rdvl x8, #1
106106
; CHECK-NEXT: mov w9, #67108864
107107
; CHECK-NEXT: lsr x8, x8, #4
108-
; CHECK-NEXT: mov z1.d, x1
108+
; CHECK-NEXT: add x10, x0, x1
109109
; CHECK-NEXT: punpklo p1.h, p0.b
110-
; CHECK-NEXT: punpkhi p0.h, p0.b
110+
; CHECK-NEXT: uunpklo z3.d, z0.s
111111
; CHECK-NEXT: mul x8, x8, x9
112112
; CHECK-NEXT: mov w9, #33554432
113-
; CHECK-NEXT: index z2.d, #0, x9
114-
; CHECK-NEXT: mov z3.d, x8
115-
; CHECK-NEXT: add z3.d, z2.d, z3.d
116-
; CHECK-NEXT: add z2.d, z2.d, z1.d
117-
; CHECK-NEXT: add z1.d, z3.d, z1.d
118-
; CHECK-NEXT: uunpklo z3.d, z0.s
113+
; CHECK-NEXT: punpkhi p0.h, p0.b
119114
; CHECK-NEXT: uunpkhi z0.d, z0.s
120-
; CHECK-NEXT: st1b { z3.d }, p1, [x0, z2.d]
121-
; CHECK-NEXT: st1b { z0.d }, p0, [x0, z1.d]
115+
; CHECK-NEXT: index z1.d, #0, x9
116+
; CHECK-NEXT: mov z2.d, x8
117+
; CHECK-NEXT: st1b { z3.d }, p1, [x10, z1.d]
118+
; CHECK-NEXT: add z2.d, z1.d, z2.d
119+
; CHECK-NEXT: st1b { z0.d }, p0, [x10, z2.d]
122120
; CHECK-NEXT: ret
123121
%t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
124122
%t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
@@ -140,20 +138,18 @@ define void @scatter_i8_index_offset_minimum_minus_one(i8* %base, i64 %offset, <
140138
; CHECK-NEXT: mov x9, #-2
141139
; CHECK-NEXT: lsr x8, x8, #4
142140
; CHECK-NEXT: movk x9, #64511, lsl #16
143-
; CHECK-NEXT: mov z1.d, x1
141+
; CHECK-NEXT: add x10, x0, x1
144142
; CHECK-NEXT: punpklo p1.h, p0.b
145143
; CHECK-NEXT: mul x8, x8, x9
146144
; CHECK-NEXT: mov x9, #-33554433
147-
; CHECK-NEXT: punpkhi p0.h, p0.b
148-
; CHECK-NEXT: index z2.d, #0, x9
149-
; CHECK-NEXT: mov z3.d, x8
150-
; CHECK-NEXT: add z3.d, z2.d, z3.d
151-
; CHECK-NEXT: add z2.d, z2.d, z1.d
152-
; CHECK-NEXT: add z1.d, z3.d, z1.d
153145
; CHECK-NEXT: uunpklo z3.d, z0.s
146+
; CHECK-NEXT: punpkhi p0.h, p0.b
154147
; CHECK-NEXT: uunpkhi z0.d, z0.s
155-
; CHECK-NEXT: st1b { z3.d }, p1, [x0, z2.d]
156-
; CHECK-NEXT: st1b { z0.d }, p0, [x0, z1.d]
148+
; CHECK-NEXT: index z1.d, #0, x9
149+
; CHECK-NEXT: mov z2.d, x8
150+
; CHECK-NEXT: st1b { z3.d }, p1, [x10, z1.d]
151+
; CHECK-NEXT: add z2.d, z1.d, z2.d
152+
; CHECK-NEXT: st1b { z0.d }, p0, [x10, z2.d]
157153
; CHECK-NEXT: ret
158154
%t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
159155
%t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
@@ -174,20 +170,18 @@ define void @scatter_i8_index_stride_too_big(i8* %base, i64 %offset, <vscale x 4
174170
; CHECK-NEXT: rdvl x8, #1
175171
; CHECK-NEXT: mov x9, #-9223372036854775808
176172
; CHECK-NEXT: lsr x8, x8, #4
177-
; CHECK-NEXT: mov z1.d, x1
173+
; CHECK-NEXT: add x10, x0, x1
178174
; CHECK-NEXT: punpklo p1.h, p0.b
179-
; CHECK-NEXT: punpkhi p0.h, p0.b
175+
; CHECK-NEXT: uunpklo z3.d, z0.s
180176
; CHECK-NEXT: mul x8, x8, x9
181177
; CHECK-NEXT: mov x9, #4611686018427387904
182-
; CHECK-NEXT: index z2.d, #0, x9
183-
; CHECK-NEXT: mov z3.d, x8
184-
; CHECK-NEXT: add z3.d, z2.d, z3.d
185-
; CHECK-NEXT: add z2.d, z2.d, z1.d
186-
; CHECK-NEXT: add z1.d, z3.d, z1.d
187-
; CHECK-NEXT: uunpklo z3.d, z0.s
178+
; CHECK-NEXT: punpkhi p0.h, p0.b
188179
; CHECK-NEXT: uunpkhi z0.d, z0.s
189-
; CHECK-NEXT: st1b { z3.d }, p1, [x0, z2.d]
190-
; CHECK-NEXT: st1b { z0.d }, p0, [x0, z1.d]
180+
; CHECK-NEXT: index z1.d, #0, x9
181+
; CHECK-NEXT: mov z2.d, x8
182+
; CHECK-NEXT: st1b { z3.d }, p1, [x10, z1.d]
183+
; CHECK-NEXT: add z2.d, z1.d, z2.d
184+
; CHECK-NEXT: st1b { z0.d }, p0, [x10, z2.d]
191185
; CHECK-NEXT: ret
192186
%t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
193187
%t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
@@ -346,9 +340,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64_const_with_vec_offsets(<vscale
346340
define <vscale x 2 x i64> @masked_gather_nxv2i64_null_with_vec_plus_scalar_offsets(<vscale x 2 x i64> %vector_offsets, i64 %scalar_offset, <vscale x 2 x i1> %pg) #0 {
347341
; CHECK-LABEL: masked_gather_nxv2i64_null_with_vec_plus_scalar_offsets:
348342
; CHECK: // %bb.0:
349-
; CHECK-NEXT: mov x8, xzr
350-
; CHECK-NEXT: mov z1.d, x0
351-
; CHECK-NEXT: add z0.d, z0.d, z1.d
343+
; CHECK-NEXT: lsl x8, x0, #3
352344
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3]
353345
; CHECK-NEXT: ret
354346
%scalar_offset.ins = insertelement <vscale x 2 x i64> undef, i64 %scalar_offset, i64 0
@@ -362,8 +354,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64_null_with_vec_plus_scalar_offse
362354
define <vscale x 2 x i64> @masked_gather_nxv2i64_null_with__vec_plus_imm_offsets(<vscale x 2 x i64> %vector_offsets, <vscale x 2 x i1> %pg) #0 {
363355
; CHECK-LABEL: masked_gather_nxv2i64_null_with__vec_plus_imm_offsets:
364356
; CHECK: // %bb.0:
365-
; CHECK-NEXT: mov x8, xzr
366-
; CHECK-NEXT: add z0.d, z0.d, #1 // =0x1
357+
; CHECK-NEXT: mov w8, #8
367358
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3]
368359
; CHECK-NEXT: ret
369360
%scalar_offset.ins = insertelement <vscale x 2 x i64> undef, i64 1, i64 0
@@ -427,9 +418,7 @@ define void @masked_scatter_nxv2i64_const_with_vec_offsets(<vscale x 2 x i64> %v
427418
define void @masked_scatter_nxv2i64_null_with_vec_plus_scalar_offsets(<vscale x 2 x i64> %vector_offsets, i64 %scalar_offset, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %data) #0 {
428419
; CHECK-LABEL: masked_scatter_nxv2i64_null_with_vec_plus_scalar_offsets:
429420
; CHECK: // %bb.0:
430-
; CHECK-NEXT: mov x8, xzr
431-
; CHECK-NEXT: mov z2.d, x0
432-
; CHECK-NEXT: add z0.d, z0.d, z2.d
421+
; CHECK-NEXT: lsl x8, x0, #3
433422
; CHECK-NEXT: st1d { z1.d }, p0, [x8, z0.d, lsl #3]
434423
; CHECK-NEXT: ret
435424
%scalar_offset.ins = insertelement <vscale x 2 x i64> undef, i64 %scalar_offset, i64 0
@@ -443,8 +432,7 @@ define void @masked_scatter_nxv2i64_null_with_vec_plus_scalar_offsets(<vscale x
443432
define void @masked_scatter_nxv2i64_null_with__vec_plus_imm_offsets(<vscale x 2 x i64> %vector_offsets, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %data) #0 {
444433
; CHECK-LABEL: masked_scatter_nxv2i64_null_with__vec_plus_imm_offsets:
445434
; CHECK: // %bb.0:
446-
; CHECK-NEXT: mov x8, xzr
447-
; CHECK-NEXT: add z0.d, z0.d, #1 // =0x1
435+
; CHECK-NEXT: mov w8, #8
448436
; CHECK-NEXT: st1d { z1.d }, p0, [x8, z0.d, lsl #3]
449437
; CHECK-NEXT: ret
450438
%scalar_offset.ins = insertelement <vscale x 2 x i64> undef, i64 1, i64 0

0 commit comments

Comments
 (0)