
Commit 7bb4ec3

fixup! Recognizing masks assembled by AND
1 parent 95f772e commit 7bb4ec3

File tree: 2 files changed (+135, -68 lines)


llvm/lib/CodeGen/InterleavedAccessPass.cpp

Lines changed: 61 additions & 28 deletions
@@ -385,25 +385,25 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);
 
   Value *Mask = nullptr;
-  unsigned MaskFactor = Factor;
+  unsigned GapMaskFactor = Factor;
   if (LI) {
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
   } else {
     // Check mask operand. Handle both all-true/false and interleaved mask.
-    std::tie(Mask, MaskFactor) = getMask(getMaskOperand(II), Factor, VecTy);
+    std::tie(Mask, GapMaskFactor) = getMask(getMaskOperand(II), Factor, VecTy);
     if (!Mask)
       return false;
 
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: "
                       << *Load << "\n");
     LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor
-                      << " and mask factor " << MaskFactor << "\n");
+                      << " and mask factor " << GapMaskFactor << "\n");
   }
 
   // Try to create target specific intrinsics to replace the load and
   // shuffles.
   if (!TLI->lowerInterleavedLoad(cast<Instruction>(Load), Mask, Shuffles,
-                                 Indices, Factor, MaskFactor))
+                                 Indices, Factor, GapMaskFactor))
     // If Extracts is not empty, tryReplaceExtracts made changes earlier.
     return !Extracts.empty() || BinOpShuffleChanged;

@@ -540,15 +540,20 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
          "number of stored element should be a multiple of Factor");
 
   Value *Mask = nullptr;
+  unsigned GapMaskFactor = Factor;
   if (SI) {
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
   } else {
     // Check mask operand. Handle both all-true/false and interleaved mask.
     unsigned LaneMaskLen = NumStoredElements / Factor;
-    std::tie(Mask, std::ignore) = getMask(getMaskOperand(II), Factor,
-                                          ElementCount::getFixed(LaneMaskLen));
+    std::tie(Mask, GapMaskFactor) = getMask(
+        getMaskOperand(II), Factor, ElementCount::getFixed(LaneMaskLen));
     if (!Mask)
       return false;
+    // We shouldn't transform stores even if they have a gap mask, and since
+    // we might have already changed the IR, we return true here.
+    if (GapMaskFactor != Factor)
+      return true;
 
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: "
                       << *Store << "\n");
@@ -565,57 +570,85 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
     return true;
   }
 
+// A wide mask <1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0> could be used to skip the
+// last field in a factor-of-three interleaved store or deinterleaved load (in
+// which case LeafMaskLen is 4). Such a (wide) mask is also known as a gap
+// mask. This helper function tries to detect this pattern and returns the
+// actual factor we're accessing, which is 2 in this example.
+static unsigned getGapMaskFactor(const Constant &MaskConst, unsigned Factor,
+                                 unsigned LeafMaskLen) {
+  APInt FactorMask(Factor, 0);
+  FactorMask.setAllBits();
+  for (unsigned F = 0U; F < Factor; ++F) {
+    unsigned Idx;
+    for (Idx = 0U; Idx < LeafMaskLen; ++Idx) {
+      Constant *C = MaskConst.getAggregateElement(F + Idx * Factor);
+      if (!C->isZeroValue())
+        break;
+    }
+    // All mask bits on this field are zero; skip it.
+    if (Idx >= LeafMaskLen)
+      FactorMask.clearBit(F);
+  }
+  // We currently only allow gaps in the "trailing" factors / fields. So given
+  // an original factor of 4, we can skip fields 2 and 3, but we cannot skip
+  // only fields 1 and 2. If FactorMask does not match such a pattern, reset it.
+  if (!FactorMask.isMask())
+    FactorMask.setAllBits();
+
+  return FactorMask.popcount();
+}
+
 static std::pair<Value *, unsigned> getMask(Value *WideMask, unsigned Factor,
                                             ElementCount LeafValueEC) {
+  using namespace PatternMatch;
+
   if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) {
     if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID());
         F && F == Factor && llvm::all_equal(IMI->args())) {
       return {IMI->getArgOperand(0), Factor};
     }
   }
 
+  // Try to match `and <interleaved mask>, <gap mask>`. The WideMask here is
+  // expected to be a fixed vector and the gap mask should be a constant mask.
+  Value *AndMaskLHS;
+  Constant *AndMaskRHS;
+  if (match(WideMask, m_c_And(m_Value(AndMaskLHS), m_Constant(AndMaskRHS))) &&
+      LeafValueEC.isFixed()) {
+    assert(!isa<Constant>(AndMaskLHS) &&
+           "expect constants to be folded already");
+    return {getMask(AndMaskLHS, Factor, LeafValueEC).first,
+            getGapMaskFactor(*AndMaskRHS, Factor, LeafValueEC.getFixedValue())};
+  }
+
   if (auto *ConstMask = dyn_cast<Constant>(WideMask)) {
     if (auto *Splat = ConstMask->getSplatValue())
       // All-ones or all-zeros mask.
       return {ConstantVector::getSplat(LeafValueEC, Splat), Factor};
 
     if (LeafValueEC.isFixed()) {
       unsigned LeafMaskLen = LeafValueEC.getFixedValue();
-      // First, check if the mask completely skips some of the factors / fields.
-      APInt FactorMask(Factor, 0);
-      FactorMask.setAllBits();
-      for (unsigned F = 0U; F < Factor; ++F) {
-        unsigned Idx;
-        for (Idx = 0U; Idx < LeafMaskLen; ++Idx) {
-          Constant *C = ConstMask->getAggregateElement(F + Idx * Factor);
-          if (!C->isZeroValue())
-            break;
-        }
-        // All mask bits on this field are zero, skipping it.
-        if (Idx >= LeafMaskLen)
-          FactorMask.clearBit(F);
-      }
-      // We currently only support skipping "trailing" factors / fields. So
-      // given the original factor being 4, we can skip fields 2 and 3, but we
-      // cannot only skip fields 1 and 2. If FactorMask does not match such
-      // pattern, reset it.
-      if (!FactorMask.isMask())
-        FactorMask.setAllBits();
+      // First, check if we use a gap mask to skip some of the factors / fields.
+      const unsigned GapMaskFactor =
+          getGapMaskFactor(*ConstMask, Factor, LeafMaskLen);
+      assert(GapMaskFactor <= Factor);
 
       SmallVector<Constant *, 8> LeafMask(LeafMaskLen, nullptr);
       // If this is a fixed-length constant mask, each lane / leaf has to
       // use the same mask. This is done by checking if every group with Factor
       // number of elements in the interleaved mask has homogeneous values.
       for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) {
-        if (!FactorMask[Idx % Factor])
+        if (Idx % Factor >= GapMaskFactor)
           continue;
         Constant *C = ConstMask->getAggregateElement(Idx);
         if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C)
           return {nullptr, Factor};
         LeafMask[Idx / Factor] = C;
       }
 
-      return {ConstantVector::get(LeafMask), FactorMask.popcount()};
+      return {ConstantVector::get(LeafMask), GapMaskFactor};
     }
   }
 
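For readers following the new getGapMaskFactor above, here is a minimal standalone sketch of the trailing-gap detection it performs. This is plain C++ with no LLVM dependencies; the function effectiveFactor and all variable names are illustrative assumptions, not code from this commit, and corner cases (for example an all-zero mask) are only approximated.

// Sketch: a wide mask of Factor * LeafMaskLen bits is laid out so that
// element i of field f sits at index f + i * Factor. Fields whose bits are
// all zero are "gaps"; only a contiguous run of trailing gaps reduces the
// effective factor, any other pattern keeps the nominal factor.
#include <cstdio>
#include <vector>

static unsigned effectiveFactor(const std::vector<bool> &WideMask,
                                unsigned Factor, unsigned LeafMaskLen) {
  unsigned LiveFields = 0; // leading fields with at least one set bit
  bool SawGap = false;
  for (unsigned F = 0; F < Factor; ++F) {
    bool AnySet = false;
    for (unsigned I = 0; I < LeafMaskLen; ++I)
      AnySet = AnySet || WideMask[F + I * Factor];
    if (!AnySet)
      SawGap = true;   // field F is completely masked off
    else if (SawGap)
      return Factor;   // a live field after a gap: gaps are not trailing
    else
      ++LiveFields;
  }
  return LiveFields ? LiveFields : Factor;
}

int main() {
  // Factor-3 mask <1,1,0, 1,1,0, 1,1,0, 1,1,0>: the last field is skipped,
  // so the effective factor is 2, matching the comment on getGapMaskFactor.
  std::vector<bool> M = {true, true, false, true, true, false,
                         true, true, false, true, true, false};
  std::printf("%u\n", effectiveFactor(M, 3, 4)); // prints 2
}

As in the patch, a gap is only honored when the skipped fields form a contiguous run at the end of the field list; otherwise the nominal factor is kept and the mask is treated as a plain interleaved mask.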

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll

Lines changed: 74 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,24 @@ define {<4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) {
   ret {<4 x i32>, <4 x i32>} %res1
 }
 
+define {<4 x i32>, <4 x i32>} @vpload_factor3_combined_mask_skip_field(ptr %ptr, <4 x i1> %mask) {
+; CHECK-LABEL: vpload_factor3_combined_mask_skip_field:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 12
+; CHECK-NEXT:    vsetivli zero, 6, e32, m1, ta, ma
+; CHECK-NEXT:    vlsseg2e32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+  %interleaved.mask = shufflevector <4 x i1> %mask, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
+  %combined = and <12 x i1> %interleaved.mask, <i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false>
+  %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> %combined, i32 12)
+  ; mask = %mask, skip the last field
+  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+  %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+  ret {<4 x i32>, <4 x i32>} %res1
+}
+
 define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) {
 ; CHECK-LABEL: vpload_factor4:
 ; CHECK:       # %bb.0:
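The CHECK lines in the new test above tie the two factors together at the RISC-V level: the nominal factor of 3 still determines the byte stride between element groups (li a1, 12 for 4-byte elements), while the gap-mask factor of 2 selects the two-field segmented load vlsseg2e32, with the recovered 4-lane mask applied through v0.t. A minimal sketch of that arithmetic, in plain C++ with illustrative names only (not code from this commit):

#include <cstdio>

int main() {
  // Assumptions mirroring the test: factor-3 layout of i32 elements, with a
  // constant gap mask disabling the last field (gap-mask factor 2).
  const unsigned NominalFactor = 3; // fields laid out in memory
  const unsigned GapMaskFactor = 2; // fields actually loaded
  const unsigned ElemBytes = 4;     // 32-bit elements

  // The stride between consecutive element groups is still based on the
  // nominal factor, which is why the test materializes 12 in a1.
  unsigned StrideBytes = NominalFactor * ElemBytes;
  std::printf("stride = %u bytes, %u-field segments\n", StrideBytes,
              GapMaskFactor);
}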
@@ -514,8 +532,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    lui a3, 12
 ; RV32-NEXT:    lui a6, 12291
-; RV32-NEXT:    lui a7, %hi(.LCPI25_0)
-; RV32-NEXT:    addi a7, a7, %lo(.LCPI25_0)
+; RV32-NEXT:    lui a7, %hi(.LCPI26_0)
+; RV32-NEXT:    addi a7, a7, %lo(.LCPI26_0)
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v24, (a5)
 ; RV32-NEXT:    vmv.s.x v0, a3
@@ -600,12 +618,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
 ; RV32-NEXT:    lui a7, 49164
-; RV32-NEXT:    lui a1, %hi(.LCPI25_1)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_1)
+; RV32-NEXT:    lui a1, %hi(.LCPI26_1)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_1)
 ; RV32-NEXT:    lui t2, 3
 ; RV32-NEXT:    lui t1, 196656
-; RV32-NEXT:    lui a4, %hi(.LCPI25_3)
-; RV32-NEXT:    addi a4, a4, %lo(.LCPI25_3)
+; RV32-NEXT:    lui a4, %hi(.LCPI26_3)
+; RV32-NEXT:    addi a4, a4, %lo(.LCPI26_3)
 ; RV32-NEXT:    lui t0, 786624
 ; RV32-NEXT:    li a5, 48
 ; RV32-NEXT:    lui a6, 768
@@ -784,8 +802,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vrgatherei16.vv v24, v8, v2
-; RV32-NEXT:    lui a1, %hi(.LCPI25_2)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_2)
+; RV32-NEXT:    lui a1, %hi(.LCPI26_2)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_2)
 ; RV32-NEXT:    lui a3, 3073
 ; RV32-NEXT:    addi a3, a3, -1024
 ; RV32-NEXT:    vmv.s.x v0, a3
@@ -849,16 +867,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vrgatherei16.vv v28, v8, v3
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v28, v24
-; RV32-NEXT:    lui a1, %hi(.LCPI25_4)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_4)
-; RV32-NEXT:    lui a2, %hi(.LCPI25_5)
-; RV32-NEXT:    addi a2, a2, %lo(.LCPI25_5)
+; RV32-NEXT:    lui a1, %hi(.LCPI26_4)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_4)
+; RV32-NEXT:    lui a2, %hi(.LCPI26_5)
+; RV32-NEXT:    addi a2, a2, %lo(.LCPI26_5)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v24, (a2)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vle16.v v8, (a1)
-; RV32-NEXT:    lui a1, %hi(.LCPI25_7)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_7)
+; RV32-NEXT:    lui a1, %hi(.LCPI26_7)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_7)
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle16.v v10, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
@@ -886,14 +904,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vrgatherei16.vv v16, v0, v10
-; RV32-NEXT:    lui a1, %hi(.LCPI25_6)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_6)
-; RV32-NEXT:    lui a2, %hi(.LCPI25_8)
-; RV32-NEXT:    addi a2, a2, %lo(.LCPI25_8)
+; RV32-NEXT:    lui a1, %hi(.LCPI26_6)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_6)
+; RV32-NEXT:    lui a2, %hi(.LCPI26_8)
+; RV32-NEXT:    addi a2, a2, %lo(.LCPI26_8)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vle16.v v4, (a1)
-; RV32-NEXT:    lui a1, %hi(.LCPI25_9)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_9)
+; RV32-NEXT:    lui a1, %hi(.LCPI26_9)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_9)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v6, (a1)
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
@@ -980,8 +998,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    li a4, 128
 ; RV64-NEXT:    lui a1, 1
 ; RV64-NEXT:    vle64.v v8, (a3)
-; RV64-NEXT:    lui a3, %hi(.LCPI25_0)
-; RV64-NEXT:    addi a3, a3, %lo(.LCPI25_0)
+; RV64-NEXT:    lui a3, %hi(.LCPI26_0)
+; RV64-NEXT:    addi a3, a3, %lo(.LCPI26_0)
 ; RV64-NEXT:    vmv.s.x v0, a4
 ; RV64-NEXT:    csrr a4, vlenb
 ; RV64-NEXT:    li a5, 61
@@ -1169,8 +1187,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vslideup.vi v12, v16, 1, v0.t
-; RV64-NEXT:    lui a2, %hi(.LCPI25_1)
-; RV64-NEXT:    addi a2, a2, %lo(.LCPI25_1)
+; RV64-NEXT:    lui a2, %hi(.LCPI26_1)
+; RV64-NEXT:    addi a2, a2, %lo(.LCPI26_1)
 ; RV64-NEXT:    li a3, 192
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vle16.v v6, (a2)
@@ -1204,8 +1222,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vrgatherei16.vv v24, v16, v6
 ; RV64-NEXT:    addi a2, sp, 16
 ; RV64-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui a2, %hi(.LCPI25_2)
-; RV64-NEXT:    addi a2, a2, %lo(.LCPI25_2)
+; RV64-NEXT:    lui a2, %hi(.LCPI26_2)
+; RV64-NEXT:    addi a2, a2, %lo(.LCPI26_2)
 ; RV64-NEXT:    li a3, 1040
 ; RV64-NEXT:    vmv.s.x v0, a3
 ; RV64-NEXT:    addi a1, a1, -2016
@@ -1289,12 +1307,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
-; RV64-NEXT:    lui a1, %hi(.LCPI25_3)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI25_3)
+; RV64-NEXT:    lui a1, %hi(.LCPI26_3)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI26_3)
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vle16.v v20, (a1)
-; RV64-NEXT:    lui a1, %hi(.LCPI25_4)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI25_4)
+; RV64-NEXT:    lui a1, %hi(.LCPI26_4)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI26_4)
 ; RV64-NEXT:    vle16.v v8, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 77
@@ -1345,8 +1363,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vl2r.v v8, (a1) # vscale x 16-byte Folded Reload
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vrgatherei16.vv v0, v16, v8
-; RV64-NEXT:    lui a1, %hi(.LCPI25_5)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI25_5)
+; RV64-NEXT:    lui a1, %hi(.LCPI26_5)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI26_5)
 ; RV64-NEXT:    vle16.v v20, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 61
@@ -1963,8 +1981,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) {
 ; RV32-NEXT:    vle32.v v12, (a0), v0.t
 ; RV32-NEXT:    li a0, 36
 ; RV32-NEXT:    vmv.s.x v20, a1
-; RV32-NEXT:    lui a1, %hi(.LCPI61_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI61_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI62_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI62_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v21, (a1)
 ; RV32-NEXT:    vcompress.vm v8, v12, v11
@@ -2039,8 +2057,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) {
 ; RV32-NEXT:    vmv.s.x v10, a0
 ; RV32-NEXT:    li a0, 146
 ; RV32-NEXT:    vmv.s.x v11, a0
-; RV32-NEXT:    lui a0, %hi(.LCPI62_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI62_0)
+; RV32-NEXT:    lui a0, %hi(.LCPI63_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI63_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v20, (a0)
 ; RV32-NEXT:    li a0, 36
@@ -2159,7 +2177,6 @@ define {<4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr) {
   ; mask = 1111, skip last field
   %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
   %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-  %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
   %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
   %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
   ret {<4 x i32>, <4 x i32>} %res1
@@ -2177,7 +2194,24 @@ define {<4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) {
   ; mask = 1010, skip the last field
   %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
   %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-  %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+  %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+  ret {<4 x i32>, <4 x i32>} %res1
+}
+
+define {<4 x i32>, <4 x i32>} @maskedload_factor3_combined_mask_skip_field(ptr %ptr, <4 x i1> %mask) {
+; CHECK-LABEL: maskedload_factor3_combined_mask_skip_field:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 12
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vlsseg2e32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+  %interleaved.mask = shufflevector <4 x i1> %mask, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
+  %combined = and <12 x i1> %interleaved.mask, <i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false>
+  %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> %combined, <12 x i32> poison)
+  ; mask = %mask, skip the last field
+  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
   %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
   %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
   ret {<4 x i32>, <4 x i32>} %res1
@@ -2200,8 +2234,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field(
 ; RV32-NEXT:    vle32.v v12, (a0), v0.t
 ; RV32-NEXT:    li a0, 36
 ; RV32-NEXT:    vmv.s.x v20, a1
-; RV32-NEXT:    lui a1, %hi(.LCPI68_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI68_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI70_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI70_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v21, (a1)
 ; RV32-NEXT:    vcompress.vm v8, v12, v11
