
Commit 269e53a

[LV][AArch64] Improve strided access vectorization for AArch64 SVE
Currently, LLVM vectorizes strided accesses with SVE as follows.

```c
void func(double* restrict a, double* b, int n) {
  for (int i = 0; i < n; i++) {
    a[i] = b[i * 10] + 1;
  }
}
```

=>

```
  ...
  index z1.d, #0, #1
loop:
  add z2.d, z1.d, z0.d
  mul z1.d, z1.d, #80
  ld1d { z1.d }, p0/z, [x1, z1.d]
  ...
  mov z1.d, z2.d
  ...
```

This generated code is inefficient because it performs the address calculation inside the loop with vector instructions, which can lead to performance degradation (see llvm#129474).

Ideally, we want to generate efficient instructions that keep the offset vector `z1` constant and update the base register `x1` with a scalar instruction:

```
  ...
  index z1.d, #0, #10
loop:
  ld1d z2.d, p7/z, [x1, z1.d, lsl 3]
  ...
  add x1, x1, x2
  ...
```

This patch enables strided accesses to be vectorized efficiently as shown above. It builds on llvm#147297, which detects strided accesses and converts them into stride recipes; this patch then rewrites those recipes into a legal and efficient sequence of recipes for AArch64.

I am keeping this patch as a draft for the following reasons:
- I have not yet been able to create sufficient test cases for it.
- I have not yet been able to confirm that there are no performance regressions.
1 parent 65c7d0b commit 269e53a
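For reference, here is a minimal, self-contained scalar C++ model of the addressing scheme the patch aims for (illustrative only; the function name and constants below are hypothetical and not part of the patch). The per-lane offsets stay fixed across iterations and only the scalar base pointer advances, mirroring the `index`/`ld1d`/`add` sequence above.

```c++
// Hypothetical scalar model of the desired SVE addressing: constant per-lane
// offsets (the "index z1.d, #0, #10" analogue) plus a scalar base pointer
// bumped once per vector iteration (the "add x1, x1, x2" analogue).
#include <cstddef>

void strided_model(double *a, const double *b, size_t n) {
  const size_t Stride = 10; // elements between consecutive loads
  const size_t VF = 4;      // assumed number of lanes
  size_t Offsets[VF];       // stays constant for the whole loop
  for (size_t L = 0; L < VF; ++L)
    Offsets[L] = L * Stride;

  const double *Base = b;
  size_t I = 0;
  for (; I + VF <= n; I += VF) {
    for (size_t L = 0; L < VF; ++L) // ld1d { z }, [Base, Offsets, lsl 3]
      a[I + L] = Base[Offsets[L]] + 1.0;
    Base += VF * Stride;            // scalar base update
  }
  for (; I < n; ++I)                // scalar epilogue
    a[I] = b[I * Stride] + 1.0;
}
```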


11 files changed (+270, -33 lines)


llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 5 additions & 0 deletions
@@ -852,6 +852,11 @@ class TargetTransformInfo {
   /// Return true if the target supports strided load.
   LLVM_ABI bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const;
 
+  /// Return true if the target benefits from the generation of a more
+  /// efficient instruction sequence for strided accesses.
+  LLVM_ABI bool preferToUseStrideRecipesForVectorization(Type *DataType,
+                                                         Align Alignment) const;
+
   /// Return true is the target supports interleaved access for the given vector
   /// type \p VTy, interleave factor \p Factor, alignment \p Alignment and
   /// address space \p AddrSpace.
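To summarize how the new hook is meant to be consumed, here is a small, self-contained sketch. The struct and function below are hypothetical and only model the decision; the actual logic is the `IsProfitable` lambda in `VPlanTransforms::convertToStridedAccesses` shown further down.

```c++
// Simplified model of the planner's decision with the new hook.
#include <cstdint>

struct TargetQueries {
  bool LegalStridedLoadStore;  // models TTI.isLegalStridedLoadStore(...)
  bool PrefersStrideRecipes;   // models TTI.preferToUseStrideRecipesForVectorization(...)
};

bool shouldUseStrideRecipe(const TargetQueries &TTI, int64_t CurrentCost,
                           int64_t StridedCost) {
  // Targets with native strided loads/stores keep the existing cost check.
  if (TTI.LegalStridedLoadStore)
    return StridedCost < CurrentCost;
  // Otherwise convert anyway when the target (e.g. AArch64 SVE) says the
  // stride recipe can later be legalized into a cheaper sequence.
  return TTI.PrefersStrideRecipes;
}
```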

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 5 additions & 0 deletions
@@ -374,6 +374,11 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  virtual bool preferToUseStrideRecipesForVectorization(Type *DataType,
+                                                        Align Alignment) const {
+    return false;
+  }
+
   virtual bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
                                             Align Alignment,
                                             unsigned AddrSpace) const {

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 5 additions & 0 deletions
@@ -532,6 +532,11 @@ bool TargetTransformInfo::isLegalStridedLoadStore(Type *DataType,
   return TTIImpl->isLegalStridedLoadStore(DataType, Alignment);
 }
 
+bool TargetTransformInfo::preferToUseStrideRecipesForVectorization(
+    Type *DataType, Align Alignment) const {
+  return TTIImpl->preferToUseStrideRecipesForVectorization(DataType, Alignment);
+}
+
 bool TargetTransformInfo::isLegalInterleavedAccessType(
     VectorType *VTy, unsigned Factor, Align Alignment,
     unsigned AddrSpace) const {

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 6 additions & 0 deletions
@@ -346,6 +346,12 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
     return isLegalMaskedGatherScatter(DataType);
   }
 
+  bool
+  preferToUseStrideRecipesForVectorization(Type *DataType,
+                                           Align Alignment) const override {
+    return isLegalMaskedGatherScatter(DataType);
+  }
+
   bool isLegalBroadcastLoad(Type *ElementTy,
                             ElementCount NumElements) const override {
     // Return true if we can generate a `ld1r` splat load instruction.

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 3 additions & 0 deletions
@@ -8627,6 +8627,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
                            CostCtx, Range);
 
+  VPlanTransforms::runPass(VPlanTransforms::legalizeStridedAccess, *Plan,
+                           CostCtx, Range);
+
   for (ElementCount VF : Range)
     Plan->addVF(VF);
   Plan->setName("Initial VPlan");

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 65 additions & 8 deletions
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "VPlanTransforms.h"
+#include "LoopVectorizationPlanner.h"
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
@@ -20,6 +21,7 @@
 #include "VPlanHelpers.h"
 #include "VPlanPatternMatch.h"
 #include "VPlanUtils.h"
+#include "VPlanValue.h"
 #include "VPlanVerifier.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/PostOrderIterator.h"
@@ -31,11 +33,17 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/TypeSize.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include <cstdint>
+#include <limits>
+#include <optional>
 
 using namespace llvm;
 using namespace VPlanPatternMatch;
@@ -4336,14 +4344,20 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
     auto IsProfitable = [&](ElementCount VF) -> bool {
       Type *DataTy = toVectorTy(getLoadStoreType(&Ingredient), VF);
       const Align Alignment = getLoadStoreAlignment(&Ingredient);
-      if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
-        return false;
-      const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx);
-      const InstructionCost StridedLoadStoreCost =
-          Ctx.TTI.getStridedMemoryOpCost(Instruction::Load, DataTy, PtrUV,
-                                         MemR->isMasked(), Alignment,
-                                         Ctx.CostKind, &Ingredient);
-      return StridedLoadStoreCost < CurrentCost;
+      if (Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment)) {
+        const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx);
+        const InstructionCost StridedLoadStoreCost =
+            Ctx.TTI.getStridedMemoryOpCost(Instruction::Load, DataTy, PtrUV,
+                                           MemR->isMasked(), Alignment,
+                                           Ctx.CostKind, &Ingredient);
+        return StridedLoadStoreCost < CurrentCost;
+      }
+
+      if (Ctx.TTI.preferToUseStrideRecipesForVectorization(DataTy,
+                                                           Alignment)) {
+        return true;
+      }
+      return false;
     };
 
     if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsProfitable,
@@ -4393,3 +4407,46 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
   for (auto *V : PossiblyDead)
     recursivelyDeleteDeadRecipes(V);
 }
+
+void VPlanTransforms::legalizeStridedAccess(VPlan &Plan, VPCostContext &Ctx,
+                                            VFRange &Range) {
+  VPTypeAnalysis TypeInfo(Plan);
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      auto *StrideR = dyn_cast<VPWidenStridedLoadRecipe>(&R);
+      if (!StrideR)
+        continue;
+
+      Instruction &Ingredient = StrideR->getIngredient();
+      auto NeedsLegalize = [&](ElementCount VF) -> bool {
+        Type *DataTy = toVectorTy(getLoadStoreType(&Ingredient), VF);
+        const Align Alignment = getLoadStoreAlignment(&Ingredient);
+        if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
+          return true;
+        return false;
+      };
+
+      if (!LoopVectorizationPlanner::getDecisionAndClampRange(NeedsLegalize,
+                                                              Range))
+        continue;
+
+      auto *Ptr = cast<VPVectorPointerRecipe>(StrideR->getAddr());
+      auto *Stride = StrideR->getStride();
+      Type *StrideTy = TypeInfo.inferScalarType(Stride);
+
+      VPBuilder Builder(StrideR);
+      auto *Step =
+          Builder.createNaryOp(VPInstruction::StepVector, {}, StrideTy);
+      VPValue *Offset = Builder.createNaryOp(Instruction::Mul, {Step, Stride});
+      VPValue *GEP = Builder.createWidePtrAdd(Ptr->getOperand(0), Offset);
+
+      auto *LoadR = new VPWidenLoadRecipe(*cast<LoadInst>(&Ingredient), GEP,
+                                          StrideR->getMask(), false, false,
+                                          *StrideR, StrideR->getDebugLoc());
+      Builder.insert(LoadR);
+      StrideR->replaceAllUsesWith(LoadR);
+      StrideR->eraseFromParent();
+    }
+  }
+}
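As a rough mental model of the per-lane address expansion that `legalizeStridedAccess` emits, here is a minimal, self-contained sketch. The helper below is hypothetical; the real transform builds VPlan recipes (StepVector, Mul, WidePtrAdd, and a widened load) rather than scalar code.

```c++
// Hypothetical scalar model: Offsets = StepVector * Stride (in bytes),
// followed by a wide pointer add against the scalar base; each resulting
// address is then read by the widened (gather) load.
#include <cstdint>
#include <vector>

std::vector<const char *> expandStridedAddresses(const char *Base,
                                                 int64_t StrideInBytes,
                                                 unsigned VF) {
  std::vector<const char *> LaneAddrs(VF);
  for (unsigned Lane = 0; Lane < VF; ++Lane) {
    int64_t Offset = int64_t(Lane) * StrideInBytes; // Mul(StepVector, Stride)
    LaneAddrs[Lane] = Base + Offset;                // WidePtrAdd(Base, Offset)
  }
  return LaneAddrs;
}
```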

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 5 additions & 0 deletions
@@ -252,6 +252,11 @@ struct VPlanTransforms {
   static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
                                        VFRange &Range);
 
+  /// Legalize strided access recipes for targets that do not support
+  /// them natively.
+  static void legalizeStridedAccess(VPlan &Plan, VPCostContext &Ctx,
+                                    VFRange &Range);
+
   /// Remove dead recipes from \p Plan.
   static void removeDeadRecipes(VPlan &Plan);

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize,dce,instcombine -mtriple aarch64-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -S %s | FileCheck %s
+
+; Test case with strided access (fixed 80-byte stride)
+
+; void constant_stride_i64(double* a, double* b, int n) {
+;   for (int i = 0; i < n; i++) {
+;     a[i] = b[i * 10] + 1;
+;   }
+; }
+
+
+define void @constant_stride_i64(ptr noalias nocapture writeonly %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @constant_stride_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul <vscale x 4 x i64> [[TMP4]], splat (i64 80)
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw i64 [[INDEX]], 80
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[B:%.*]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], <vscale x 4 x i64> [[TMP5]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> [[TMP8]], i32 8, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i64> poison)
+; CHECK-NEXT: [[TMP9:%.*]] = add nsw <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], splat (i64 1)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: store <vscale x 4 x i64> [[TMP9]], ptr [[TMP10]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx.idx = mul nuw nsw i64 %indvars.iv, 80
+  %arrayidx = getelementptr inbounds nuw i8, ptr %b, i64 %arrayidx.idx
+  %0 = load i64, ptr %arrayidx, align 8
+  %add = add nsw i64 %0, 1
+  %arrayidx2 = getelementptr inbounds nuw i64, ptr %a, i64 %indvars.iv
+  store i64 %add, ptr %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+}
+
+; Test stride requiring scaling (10 x i64 stride)
+
+; void constant_stride_i64_scaled(double* a, double* b, int n) {
+;   for (int i = 0; i < n; i++) {
+;     a[i] = b[i * 10] + 1;
+;   }
+; }
+
+define void @constant_stride_i64_scaled(ptr noalias nocapture writeonly %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @constant_stride_i64_scaled(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul <vscale x 4 x i64> [[TMP4]], splat (i64 80)
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IDX:%.*]] = mul i64 [[INDEX]], 80
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[IDX]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], <vscale x 4 x i64> [[TMP5]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> [[TMP7]], i32 8, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i64> poison)
+; CHECK-NEXT: [[TMP8:%.*]] = add nsw <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], splat (i64 1)
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: store <vscale x 4 x i64> [[TMP8]], ptr [[TMP9]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx.idx = mul nuw nsw i64 %indvars.iv, 10
+  %arrayidx = getelementptr i64, ptr %b, i64 %arrayidx.idx
+  %0 = load i64, ptr %arrayidx, align 8
+  %add = add nsw i64 %0, 1
+  %arrayidx2 = getelementptr inbounds nuw i64, ptr %a, i64 %indvars.iv
+  store i64 %add, ptr %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+}
+
+attributes #0 = { vscale_range(1, 16) }
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}

llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll

Lines changed: 20 additions & 18 deletions
@@ -101,36 +101,38 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl <vscale x 4 x i64> %2, splat (i64 12)
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[D:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl <vscale x 4 x i64> [[TMP2]], splat (i64 1)
+; CHECK-NEXT: [[TMP4:%.*]] = shl <vscale x 4 x i64> [[TMP2]], splat (i64 1)
 ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP0]], 3
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr @AB_i16, <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP6]], i32 2, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison)
-; CHECK-NEXT: [[TMP7:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], splat (i64 1)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr @AB_i16, <vscale x 4 x i64> [[TMP7]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP8]], i32 2, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = add nsw <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
-; CHECK-NEXT: [[DOTIDX:%.*]] = shl i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr @CD, i64 [[DOTIDX]]
-; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER1]] to <vscale x 4 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = mul nsw <vscale x 4 x i32> [[BROADCAST_SPLAT3]], [[TMP11]]
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP12]])
-; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP14]], align 4
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr @AB_i16, i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], <vscale x 4 x i64> [[TMP3]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP7]], i32 2, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison)
+; CHECK-NEXT: [[TMP8:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr @AB_i16, <vscale x 4 x i64> [[TMP8]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP9]], i32 2, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison)
+; CHECK-NEXT: [[TMP10:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr @CD, i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER1]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = mul nsw <vscale x 4 x i32> [[BROADCAST_SPLAT3]], [[TMP13]]
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[TMP14]])
+; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP12]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH:%.*]]
 ; CHECK: scalar.ph:
