
Commit 441f883

Improve strided access vectorization for AArch64 SVE
Currently, LLVM vectorizes strided accesses with SVE as follows.

```c
void func(double* restrict a, double* b, int n) {
  for (int i = 0; i < n; i++) {
    a[i] = b[i * 10] + 1;
  }
}
```

=>

```
  ...
  index z1.d, #0, #1
loop:
  add  z2.d, z1.d, z0.d
  mul  z1.d, z1.d, #80
  ld1d { z1.d }, p0/z, [x1, z1.d]
  ...
  mov  z1.d, z2.d
  ...
```

The generated code is inefficient because it performs the address calculation inside the loop with vector instructions, which can degrade performance. Ideally, we want to keep the offset vector `z1` constant and update the base register `x1` with a scalar instruction:

```
  ...
  index z1.d, #0, #10
loop:
  ld1d z2.d, p7/z, [x1, z1.d, lsl 3]
  ...
  add  x1, x1, x2
  ...
```

This patch enables strided accesses to be vectorized efficiently, as shown above. It is based on llvm#147297, which detects strided accesses and converts them into stride recipes; this patch then lowers those recipes into a legal and efficient sequence of recipes for AArch64.

I am making this patch a draft for the following reasons:
- I have not yet been able to create sufficient test cases for this patch.
- I have not yet been able to confirm that there are no performance regressions.
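For reference, here is a minimal scalar model of the addressing pattern the legalized plan computes for the `b[i * 10]` example above, assuming the recipe sequence introduced below (step vector × stride, wide pointer add, widened gather load): the per-lane byte offsets are loop-invariant, and only the scalar base pointer advances each vector iteration, which is the shape the AArch64 backend can select into the `ld1d [x1, z1.d, lsl 3]` form. The function and parameter names in this sketch are illustrative only, not part of the patch.

```cpp
// Scalar model (not LLVM code) of the legalized strided load for a[i] = b[i*10] + 1.
#include <cstddef>
#include <cstdint>

void strided_gather_model(double *dst, const double *b, size_t vf,
                          size_t iterations, int64_t stride_bytes /* 80 */) {
  const char *base = reinterpret_cast<const char *>(b);
  for (size_t it = 0; it < iterations; ++it) {
    for (size_t lane = 0; lane < vf; ++lane) {
      // Offset vector: lane * stride_bytes, computed once and kept constant.
      const double *elt =
          reinterpret_cast<const double *>(base + lane * stride_bytes);
      dst[it * vf + lane] = *elt + 1.0;
    }
    base += vf * stride_bytes; // scalar base update, like `add x1, x1, x2`
  }
}
```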

11 files changed: +270 additions, −33 deletions

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 5 additions & 0 deletions
```diff
@@ -852,6 +852,11 @@ class TargetTransformInfo {
   /// Return true if the target supports strided load.
   LLVM_ABI bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const;
 
+  /// Return true if the target benefits from the generation of a more
+  /// efficient instruction sequence for strided accesses.
+  LLVM_ABI bool preferToUseStrideRecipesForVectorization(Type *DataType,
+                                                         Align Alignment) const;
+
   /// Return true is the target supports interleaved access for the given vector
   /// type \p VTy, interleave factor \p Factor, alignment \p Alignment and
   /// address space \p AddrSpace.
```

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 5 additions & 0 deletions
```diff
@@ -374,6 +374,11 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  virtual bool preferToUseStrideRecipesForVectorization(Type *DataType,
+                                                        Align Alignment) const {
+    return false;
+  }
+
   virtual bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
```

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 5 additions & 0 deletions
```diff
@@ -532,6 +532,11 @@ bool TargetTransformInfo::isLegalStridedLoadStore(Type *DataType,
   return TTIImpl->isLegalStridedLoadStore(DataType, Alignment);
 }
 
+bool TargetTransformInfo::preferToUseStrideRecipesForVectorization(
+    Type *DataType, Align Alignment) const {
+  return TTIImpl->preferToUseStrideRecipesForVectorization(DataType, Alignment);
+}
+
 bool TargetTransformInfo::isLegalInterleavedAccessType(
     VectorType *VTy, unsigned Factor, Align Alignment,
     unsigned AddrSpace) const {
```

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 6 additions & 0 deletions
```diff
@@ -346,6 +346,12 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
     return isLegalMaskedGatherScatter(DataType);
   }
 
+  bool
+  preferToUseStrideRecipesForVectorization(Type *DataType,
+                                           Align Alignment) const override {
+    return isLegalMaskedGatherScatter(DataType);
+  }
+
   bool isLegalBroadcastLoad(Type *ElementTy,
                             ElementCount NumElements) const override {
     // Return true if we can generate a `ld1r` splat load instruction.
```
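A hypothetical sketch of how another target could opt into this path via the new hook, even without a native strided load/store. `MyTargetTTIImpl` is a made-up name; real targets derive from `BasicTTIImplBase` and need the usual constructors and the rest of the TTI implementation, all omitted here for brevity. This is not part of the patch.

```cpp
#include "llvm/Analysis/TargetTransformInfoImpl.h"

namespace llvm {

// Illustrative only: request stride recipes when the masked gather that the
// legalization below falls back to is itself legal for this data type.
class MyTargetTTIImpl : public TargetTransformInfoImplBase {
public:
  bool
  preferToUseStrideRecipesForVectorization(Type *DataType,
                                           Align Alignment) const override {
    // Same idea as the AArch64 override above, expressed via the generic
    // masked-gather legality hook.
    return isLegalMaskedGather(DataType, Alignment);
  }
};

} // namespace llvm
```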

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 3 additions & 0 deletions
```diff
@@ -8627,6 +8627,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
                            CostCtx, Range);
 
+  VPlanTransforms::runPass(VPlanTransforms::legalizeStridedAccess, *Plan,
+                           CostCtx, Range);
+
   for (ElementCount VF : Range)
     Plan->addVF(VF);
   Plan->setName("Initial VPlan");
```

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 65 additions & 8 deletions
```diff
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "VPlanTransforms.h"
+#include "LoopVectorizationPlanner.h"
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
@@ -20,6 +21,7 @@
 #include "VPlanHelpers.h"
 #include "VPlanPatternMatch.h"
 #include "VPlanUtils.h"
+#include "VPlanValue.h"
 #include "VPlanVerifier.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/PostOrderIterator.h"
@@ -31,11 +33,17 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/TypeSize.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include <cstdint>
+#include <limits>
+#include <optional>
 
 using namespace llvm;
 using namespace VPlanPatternMatch;
@@ -4336,14 +4344,20 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
   auto IsProfitable = [&](ElementCount VF) -> bool {
     Type *DataTy = toVectorTy(getLoadStoreType(&Ingredient), VF);
     const Align Alignment = getLoadStoreAlignment(&Ingredient);
-    if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
-      return false;
-    const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx);
-    const InstructionCost StridedLoadStoreCost =
-        Ctx.TTI.getStridedMemoryOpCost(Instruction::Load, DataTy, PtrUV,
-                                       MemR->isMasked(), Alignment,
-                                       Ctx.CostKind, &Ingredient);
-    return StridedLoadStoreCost < CurrentCost;
+    if (Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment)) {
+      const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx);
+      const InstructionCost StridedLoadStoreCost =
+          Ctx.TTI.getStridedMemoryOpCost(Instruction::Load, DataTy, PtrUV,
+                                         MemR->isMasked(), Alignment,
+                                         Ctx.CostKind, &Ingredient);
+      return StridedLoadStoreCost < CurrentCost;
+    }
+
+    if (Ctx.TTI.preferToUseStrideRecipesForVectorization(DataTy,
+                                                         Alignment)) {
+      return true;
+    }
+    return false;
   };
 
   if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsProfitable,
@@ -4393,3 +4407,46 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
   for (auto *V : PossiblyDead)
     recursivelyDeleteDeadRecipes(V);
 }
+
+void VPlanTransforms::legalizeStridedAccess(VPlan &Plan, VPCostContext &Ctx,
+                                            VFRange &Range) {
+  VPTypeAnalysis TypeInfo(Plan);
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      auto *StrideR = dyn_cast<VPWidenStridedLoadRecipe>(&R);
+      if (!StrideR)
+        continue;
+
+      Instruction &Ingredient = StrideR->getIngredient();
+      auto NeedsLegalize = [&](ElementCount VF) -> bool {
+        Type *DataTy = toVectorTy(getLoadStoreType(&Ingredient), VF);
+        const Align Alignment = getLoadStoreAlignment(&Ingredient);
+        if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
+          return true;
+        return false;
+      };
+
+      if (!LoopVectorizationPlanner::getDecisionAndClampRange(NeedsLegalize,
+                                                              Range))
+        continue;
+
+      auto *Ptr = cast<VPVectorPointerRecipe>(StrideR->getAddr());
+      auto *Stride = StrideR->getStride();
+      Type *StrideTy = TypeInfo.inferScalarType(Stride);
+
+      VPBuilder Builder(StrideR);
+      auto *Step =
+          Builder.createNaryOp(VPInstruction::StepVector, {}, StrideTy);
+      VPValue *Offset = Builder.createNaryOp(Instruction::Mul, {Step, Stride});
+      VPValue *GEP = Builder.createWidePtrAdd(Ptr->getOperand(0), Offset);
+
+      auto *LoadR = new VPWidenLoadRecipe(*cast<LoadInst>(&Ingredient), GEP,
+                                          StrideR->getMask(), false, false,
+                                          *StrideR, StrideR->getDebugLoc());
+      Builder.insert(LoadR);
+      StrideR->replaceAllUsesWith(LoadR);
+      StrideR->eraseFromParent();
+    }
+  }
+}
```

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 5 additions & 0 deletions
```diff
@@ -252,6 +252,11 @@ struct VPlanTransforms {
   static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
                                        VFRange &Range);
 
+  /// Legalize strided access recipes for targets that do not support
+  /// them natively.
+  static void legalizeStridedAccess(VPlan &Plan, VPCostContext &Ctx,
+                                    VFRange &Range);
+
   /// Remove dead recipes from \p Plan.
   static void removeDeadRecipes(VPlan &Plan);
```

Lines changed: 130 additions & 0 deletions
```diff
@@ -0,0 +1,130 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize,dce,instcombine -mtriple aarch64-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -S %s | FileCheck %s
+
+; Test case with strided access (fixed 80-byte stride)
+
+; void constant_stride_i64(double* a, double* b, int n) {
+;   for (int i = 0; i < n; i++) {
+;     a[i] = b[i * 10] + 1;
+;   }
+; }
+
+
+define void @constant_stride_i64(ptr noalias nocapture writeonly %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @constant_stride_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT:    [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <vscale x 4 x i64> [[TMP4]], splat (i64 80)
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw i64 [[INDEX]], 80
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[B:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], <vscale x 4 x i64> [[TMP5]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> [[TMP8]], i32 8, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i64> poison)
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], splat (i64 1)
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    store <vscale x 4 x i64> [[TMP9]], ptr [[TMP10]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx.idx = mul nuw nsw i64 %indvars.iv, 80
+  %arrayidx = getelementptr inbounds nuw i8, ptr %b, i64 %arrayidx.idx
+  %0 = load i64, ptr %arrayidx, align 8
+  %add = add nsw i64 %0, 1
+  %arrayidx2 = getelementptr inbounds nuw i64, ptr %a, i64 %indvars.iv
+  store i64 %add, ptr %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+}
+
+; Test stride requiring scaling (10 x i64 stride)
+
+; void constant_stride_i64_scaled(double* a, double* b, int n) {
+;   for (int i = 0; i < n; i++) {
+;     a[i] = b[i * 10] + 1;
+;   }
+; }
+
+define void @constant_stride_i64_scaled(ptr noalias nocapture writeonly %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @constant_stride_i64_scaled(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT:    [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <vscale x 4 x i64> [[TMP4]], splat (i64 80)
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IDX:%.*]] = mul i64 [[INDEX]], 80
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[IDX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], <vscale x 4 x i64> [[TMP5]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> [[TMP7]], i32 8, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i64> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], splat (i64 1)
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    store <vscale x 4 x i64> [[TMP8]], ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx.idx = mul nuw nsw i64 %indvars.iv, 10
+  %arrayidx = getelementptr i64, ptr %b, i64 %arrayidx.idx
+  %0 = load i64, ptr %arrayidx, align 8
+  %add = add nsw i64 %0, 1
+  %arrayidx2 = getelementptr inbounds nuw i64, ptr %a, i64 %indvars.iv
+  store i64 %add, ptr %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+}
+
+attributes #0 = { vscale_range(1, 16) }
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
```

llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll

Lines changed: 20 additions & 18 deletions
```diff
@@ -101,36 +101,38 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <vscale x 4 x i64> %2, splat (i64 12)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[D:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl <vscale x 4 x i64> [[TMP2]], splat (i64 1)
+; CHECK-NEXT:    [[TMP4:%.*]] = shl <vscale x 4 x i64> [[TMP2]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr @AB_i16, <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP6]], i32 2, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison)
-; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], splat (i64 1)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr @AB_i16, <vscale x 4 x i64> [[TMP7]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP8]], i32 2, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison)
-; CHECK-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
-; CHECK-NEXT:    [[DOTIDX:%.*]] = shl i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr @CD, i64 [[DOTIDX]]
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER1]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = mul nsw <vscale x 4 x i32> [[BROADCAST_SPLAT3]], [[TMP11]]
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP12]])
-; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP14]], align 4
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr @AB_i16, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], <vscale x 4 x i64> [[TMP3]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP7]], i32 2, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr @AB_i16, <vscale x 4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP9]], i32 2, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison)
+; CHECK-NEXT:    [[TMP10:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr @CD, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER1]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nsw <vscale x 4 x i32> [[BROADCAST_SPLAT3]], [[TMP13]]
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[TMP14]])
+; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP12]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH:%.*]]
 ; CHECK:       scalar.ph:
```
