llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp: 63 additions & 0 deletions
@@ -2650,6 +2650,7 @@ static SDValue lowerBUILD_VECTORAsBroadCastLoad(BuildVectorSDNode *BVOp,
 SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                                    SelectionDAG &DAG) const {
   BuildVectorSDNode *Node = cast<BuildVectorSDNode>(Op);
+  MVT VT = Node->getSimpleValueType(0);
   EVT ResTy = Op->getValueType(0);
   unsigned NumElts = ResTy.getVectorNumElements();
   SDLoc DL(Op);
@@ -2744,6 +2745,66 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
   }
 
   if (!IsConstant) {
+    // If the BUILD_VECTOR has a repeated pattern, use INSERT_VECTOR_ELT to
+    // materialize the sub-sequence of the vector, then broadcast it.
+    //
+    // TODO: If the BUILD_VECTOR contains undef elements, consider falling
+    // back to INSERT_VECTOR_ELT to materialize the whole vector, since the
+    // broadcast approach generates worse code in some of those cases; this
+    // could be optimized further.
+    SmallVector<SDValue> Sequence;
+    BitVector UndefElements;
+    if (Node->getRepeatedSequence(Sequence, &UndefElements) &&
+        UndefElements.count() == 0) {
+      SDValue Vector = DAG.getUNDEF(ResTy);
+      SDValue FillVec = Vector;
+      EVT FillTy = ResTy;
+
+      // Use LSX instructions to fill the sub-sequence of a 256-bit vector,
+      // since the high half can simply be treated as undef.
+      if (Is256Vec) {
+        FillTy = ResTy.getHalfNumVectorElementsVT(*DAG.getContext());
+        FillVec = DAG.getExtractSubvector(DL, FillTy, Vector, 0);
+      }
+
+      SDValue Op0 = Sequence[0];
+      unsigned SeqLen = Sequence.size();
+      if (!Op0.isUndef())
+        FillVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, FillTy, Op0);
+      for (unsigned i = 1; i < SeqLen; ++i) {
+        SDValue Opi = Sequence[i];
+        if (Opi.isUndef())
+          continue;
+        FillVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, FillTy, FillVec, Opi,
+                              DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
+      }
+
+      unsigned SplatLen = NumElts / SeqLen;
+      MVT SplatEltTy = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
+      MVT SplatTy = MVT::getVectorVT(SplatEltTy, SplatLen);
+
+      // If the sub-sequence is half of a 256-bit vector, bitcast the vector
+      // to v4i64 in order to match the pattern of XVREPLVE0Q.
+      if (SplatEltTy == MVT::i128)
+        SplatTy = MVT::v4i64;
+
+      SDValue SplatVec;
+      SDValue SrcVec = DAG.getBitcast(
+          SplatTy,
+          Is256Vec ? DAG.getInsertSubvector(DL, Vector, FillVec, 0) : FillVec);
+      if (Is256Vec) {
+        SplatVec =
+            DAG.getNode((SplatEltTy == MVT::i128) ? LoongArchISD::XVREPLVE0Q
+                                                  : LoongArchISD::XVREPLVE0,
+                        DL, SplatTy, SrcVec);
+      } else {
+        SplatVec = DAG.getNode(LoongArchISD::VREPLVEI, DL, SplatTy, SrcVec,
+                               DAG.getConstant(0, DL, Subtarget.getGRLenVT()));
+      }
+
+      return DAG.getBitcast(ResTy, SplatVec);
+    }
+
     // Use INSERT_VECTOR_ELT operations rather than expand to stores.
     // The resulting code is the same length as the expansion, but it doesn't
     // use memory operations.
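As an illustration of the new path, here is an editorial sketch (not part of the patch or its tests; the function name and the exact instruction sequence in the comment are assumptions inferred from the code and test updates in this PR). A BUILD_VECTOR whose operands repeat with period two only needs the two-element sub-sequence materialized with LSX inserts before the broadcast:

```llvm
; Hypothetical input: %a and %b repeat with period 2, so
; getRepeatedSequence() returns [%a, %b]. With SeqLen = 2 the splat element
; type is i64, so on LA64 with LASX this is expected to lower to roughly:
;   vinsgr2vr.w $vr0, $a0, 0
;   vinsgr2vr.w $vr0, $a1, 1
;   xvreplve0.d $xr0, $xr0
define <8 x i32> @repeated_pair(i32 %a, i32 %b) {
  %v0 = insertelement <8 x i32> poison, i32 %a, i32 0
  %v1 = insertelement <8 x i32> %v0, i32 %b, i32 1
  %v2 = insertelement <8 x i32> %v1, i32 %a, i32 2
  %v3 = insertelement <8 x i32> %v2, i32 %b, i32 3
  %v4 = insertelement <8 x i32> %v3, i32 %a, i32 4
  %v5 = insertelement <8 x i32> %v4, i32 %b, i32 5
  %v6 = insertelement <8 x i32> %v5, i32 %a, i32 6
  %v7 = insertelement <8 x i32> %v6, i32 %b, i32 7
  ret <8 x i32> %v7
}
```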
@@ -7108,6 +7169,8 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(VREPLGR2VR)
     NODE_NAME_CASE(XVPERMI)
     NODE_NAME_CASE(XVPERM)
+    NODE_NAME_CASE(XVREPLVE0)
+    NODE_NAME_CASE(XVREPLVE0Q)
     NODE_NAME_CASE(VPICK_SEXT_ELT)
     NODE_NAME_CASE(VPICK_ZEXT_ELT)
     NODE_NAME_CASE(VREPLVE)
llvm/lib/Target/LoongArch/LoongArchISelLowering.h: 2 additions & 0 deletions
@@ -146,6 +146,8 @@ enum NodeType : unsigned {
   VREPLGR2VR,
   XVPERMI,
   XVPERM,
+  XVREPLVE0,
+  XVREPLVE0Q,
 
   // Extended vector element extraction
   VPICK_SEXT_ELT,
llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td: 20 additions & 1 deletion
@@ -12,10 +12,14 @@
 
 def SDT_LoongArchXVPERM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
                                                SDTCisVec<2>, SDTCisInt<2>]>;
+def SDT_LoongArchXVREPLVE0 : SDTypeProfile<1, 1, [SDTCisVec<0>,
+                                                  SDTCisSameAs<0, 1>]>;
 
 // Target nodes.
 def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_LoongArchV1RUimm>;
 def loongarch_xvperm: SDNode<"LoongArchISD::XVPERM", SDT_LoongArchXVPERM>;
+def loongarch_xvreplve0: SDNode<"LoongArchISD::XVREPLVE0", SDT_LoongArchXVREPLVE0>;
+def loongarch_xvreplve0q: SDNode<"LoongArchISD::XVREPLVE0Q", SDT_LoongArchXVREPLVE0>;
 def loongarch_xvmskltz: SDNode<"LoongArchISD::XVMSKLTZ", SDT_LoongArchVMSKCOND>;
 def loongarch_xvmskgez: SDNode<"LoongArchISD::XVMSKGEZ", SDT_LoongArchVMSKCOND>;
 def loongarch_xvmskeqz: SDNode<"LoongArchISD::XVMSKEQZ", SDT_LoongArchVMSKCOND>;
@@ -1884,11 +1888,26 @@ def : Pat<(loongarch_xvperm v8i32:$xj, v8i32:$xk),
 def : Pat<(loongarch_xvperm v8f32:$xj, v8i32:$xk),
           (XVPERM_W v8f32:$xj, v8i32:$xk)>;
 
-// XVREPLVE0_{W/D}
+// XVREPLVE0_{B/H/W/D/Q}
+def : Pat<(loongarch_xvreplve0 v32i8:$xj),
+          (XVREPLVE0_B v32i8:$xj)>;
+def : Pat<(loongarch_xvreplve0 v16i16:$xj),
+          (XVREPLVE0_H v16i16:$xj)>;
+def : Pat<(loongarch_xvreplve0 v8i32:$xj),
+          (XVREPLVE0_W v8i32:$xj)>;
+def : Pat<(loongarch_xvreplve0 v4i64:$xj),
+          (XVREPLVE0_D v4i64:$xj)>;
+def : Pat<(loongarch_xvreplve0 v8f32:$xj),
+          (XVREPLVE0_W v8f32:$xj)>;
+def : Pat<(loongarch_xvreplve0 v4f64:$xj),
+          (XVREPLVE0_D v4f64:$xj)>;
 def : Pat<(lasxsplatf32 FPR32:$fj),
           (XVREPLVE0_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32))>;
 def : Pat<(lasxsplatf64 FPR64:$fj),
           (XVREPLVE0_D (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64))>;
+foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in
+  def : Pat<(vt (loongarch_xvreplve0q LASX256:$xj)),
+            (XVREPLVE0_Q LASX256:$xj)>;
 
 // VSTELM
 defm : VstelmPat<truncstorei8, v32i8, XVSTELM_B, simm8, uimm5>;
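For the XVREPLVE0Q path specifically, here is an editorial sketch (assuming LA64 with LASX; the function name is hypothetical and the commented assembly is inferred, not taken from the PR's tests). When the repeated sub-sequence is 128 bits wide, SplatEltTy becomes i128, the vector is bitcast to v4i64, and the foreach patterns above select XVREPLVE0_Q:

```llvm
; Hypothetical input: a <4 x i64> repeating with period 2 yields an i128
; sub-sequence, which is broadcast with xvreplve0.q, roughly:
;   vinsgr2vr.d $vr0, $a0, 0
;   vinsgr2vr.d $vr0, $a1, 1
;   xvreplve0.q $xr0, $xr0
define <4 x i64> @repeated_i128(i64 %a, i64 %b) {
  %v0 = insertelement <4 x i64> poison, i64 %a, i32 0
  %v1 = insertelement <4 x i64> %v0, i64 %b, i32 1
  %v2 = insertelement <4 x i64> %v1, i64 %a, i32 2
  %v3 = insertelement <4 x i64> %v2, i64 %b, i32 3
  ret <4 x i64> %v3
}
```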
llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td: 14 additions & 14 deletions
@@ -82,11 +82,11 @@ def loongarch_vmskgez: SDNode<"LoongArchISD::VMSKGEZ", SDT_LoongArchVMSKCOND>;
 def loongarch_vmskeqz: SDNode<"LoongArchISD::VMSKEQZ", SDT_LoongArchVMSKCOND>;
 def loongarch_vmsknez: SDNode<"LoongArchISD::VMSKNEZ", SDT_LoongArchVMSKCOND>;
 
-def immZExt1 : ImmLeaf<i64, [{return isUInt<1>(Imm);}]>;
-def immZExt2 : ImmLeaf<i64, [{return isUInt<2>(Imm);}]>;
-def immZExt3 : ImmLeaf<i64, [{return isUInt<3>(Imm);}]>;
-def immZExt4 : ImmLeaf<i64, [{return isUInt<4>(Imm);}]>;
-def immZExt8 : ImmLeaf<i64, [{return isUInt<8>(Imm);}]>;
+def immZExt1 : ImmLeaf<GRLenVT, [{return isUInt<1>(Imm);}]>;
+def immZExt2 : ImmLeaf<GRLenVT, [{return isUInt<2>(Imm);}]>;
+def immZExt3 : ImmLeaf<GRLenVT, [{return isUInt<3>(Imm);}]>;
+def immZExt4 : ImmLeaf<GRLenVT, [{return isUInt<4>(Imm);}]>;
+def immZExt8 : ImmLeaf<GRLenVT, [{return isUInt<8>(Imm);}]>;
Review comment from the PR author on lines +85 to +89: Tests for la32 passed.
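Context for that note (an editorial sketch, inferred from the lowering code above, not stated in the PR): the new splat path builds the VREPLVEI index with DAG.getConstant(0, DL, Subtarget.getGRLenVT()), and GRLenVT is i32 on LA32, so an ImmLeaf keyed to i64 would never match those immediates there. A minimal IR case that exercises the i32 immediate path (hypothetical function name):

```llvm
; On LA32, the repeated-sequence lowering produces
; (loongarch_vreplvei v4i32:$vj, (i32 0)); with ImmLeaf<GRLenVT> the
; immZExt2 leaf now matches that i32 immediate as well as the i64 one.
define <4 x i32> @splat4(i32 %a) {
  %v0 = insertelement <4 x i32> poison, i32 %a, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %a, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 %a, i32 2
  %v3 = insertelement <4 x i32> %v2, i32 %a, i32 3
  ret <4 x i32> %v3
}
```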
 
 class VecCond<SDPatternOperator OpNode, ValueType TyNode,
               RegisterClass RC = LSX128>
@@ -2024,15 +2024,15 @@ def : Pat<(loongarch_vilvh v4f32:$vj, v4f32:$vk),
 def : Pat<(loongarch_vilvh v2f64:$vj, v2f64:$vk),
           (VILVH_D v2f64:$vj, v2f64:$vk)>;
 
-// VSHUF4I_{B/H/W}
+// VSHUF4I_{B/H/W/D}
 def : Pat<(loongarch_vshuf4i v16i8:$vj, immZExt8:$ui8),
           (VSHUF4I_B v16i8:$vj, immZExt8:$ui8)>;
 def : Pat<(loongarch_vshuf4i v8i16:$vj, immZExt8:$ui8),
-           (VSHUF4I_H v8i16:$vj, immZExt8:$ui8)>;
+          (VSHUF4I_H v8i16:$vj, immZExt8:$ui8)>;
 def : Pat<(loongarch_vshuf4i v4i32:$vj, immZExt8:$ui8),
-           (VSHUF4I_W v4i32:$vj, immZExt8:$ui8)>;
+          (VSHUF4I_W v4i32:$vj, immZExt8:$ui8)>;
 def : Pat<(loongarch_vshuf4i v4f32:$vj, immZExt8:$ui8),
-           (VSHUF4I_W v4f32:$vj, immZExt8:$ui8)>;
+          (VSHUF4I_W v4f32:$vj, immZExt8:$ui8)>;
 def : Pat<(loongarch_vshuf4i_d v2i64:$vj, v2i64:$vk, immZExt8:$ui8),
           (VSHUF4I_D v2i64:$vj, v2i64:$vk, immZExt8:$ui8)>;
 def : Pat<(loongarch_vshuf4i_d v2f64:$vj, v2f64:$vk, immZExt8:$ui8),
@@ -2042,15 +2042,15 @@ def : Pat<(loongarch_vshuf4i_d v2f64:$vj, v2f64:$vk, immZExt8:$ui8),
 def : Pat<(loongarch_vreplvei v16i8:$vj, immZExt4:$ui4),
           (VREPLVEI_B v16i8:$vj, immZExt4:$ui4)>;
 def : Pat<(loongarch_vreplvei v8i16:$vj, immZExt3:$ui3),
-           (VREPLVEI_H v8i16:$vj, immZExt3:$ui3)>;
+          (VREPLVEI_H v8i16:$vj, immZExt3:$ui3)>;
 def : Pat<(loongarch_vreplvei v4i32:$vj, immZExt2:$ui2),
-           (VREPLVEI_W v4i32:$vj, immZExt2:$ui2)>;
+          (VREPLVEI_W v4i32:$vj, immZExt2:$ui2)>;
 def : Pat<(loongarch_vreplvei v2i64:$vj, immZExt1:$ui1),
-           (VREPLVEI_D v2i64:$vj, immZExt1:$ui1)>;
+          (VREPLVEI_D v2i64:$vj, immZExt1:$ui1)>;
 def : Pat<(loongarch_vreplvei v4f32:$vj, immZExt2:$ui2),
-           (VREPLVEI_W v4f32:$vj, immZExt2:$ui2)>;
+          (VREPLVEI_W v4f32:$vj, immZExt2:$ui2)>;
 def : Pat<(loongarch_vreplvei v2f64:$vj, immZExt1:$ui1),
-           (VREPLVEI_D v2f64:$vj, immZExt1:$ui1)>;
+          (VREPLVEI_D v2f64:$vj, immZExt1:$ui1)>;
 
 // VREPLVEI_{W/D}
 def : Pat<(lsxsplatf32 FPR32:$fj),
llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll: 12 additions & 32 deletions
@@ -9,14 +9,9 @@ define <4 x i64> @should_not_be_optimized(ptr %ptr, ptr %dst) {
 ; LA32-NEXT:    ld.w $a2, $a0, 0
 ; LA32-NEXT:    ld.w $a0, $a0, 4
 ; LA32-NEXT:    st.w $a2, $a1, 0
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a2, 0
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a0, 1
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a2, 2
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a0, 3
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a2, 4
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a0, 5
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a2, 6
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a0, 7
+; LA32-NEXT:    vinsgr2vr.w $vr0, $a2, 0
+; LA32-NEXT:    vinsgr2vr.w $vr0, $a0, 1
+; LA32-NEXT:    xvreplve0.d $xr0, $xr0
 ; LA32-NEXT:    st.w $a0, $a1, 4
 ; LA32-NEXT:    ret
 ;
@@ -64,14 +59,9 @@ define <4 x i64> @xvldrepl_d_unaligned_offset(ptr %ptr) {
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    ld.w $a1, $a0, 4
 ; LA32-NEXT:    ld.w $a0, $a0, 8
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a1, 0
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a0, 1
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a1, 2
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a0, 3
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a1, 4
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a0, 5
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a1, 6
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a0, 7
+; LA32-NEXT:    vinsgr2vr.w $vr0, $a1, 0
+; LA32-NEXT:    vinsgr2vr.w $vr0, $a0, 1
+; LA32-NEXT:    xvreplve0.d $xr0, $xr0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: xvldrepl_d_unaligned_offset:
@@ -162,14 +152,9 @@ define <4 x i64> @xvldrepl_d(ptr %ptr) {
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    ld.w $a1, $a0, 0
 ; LA32-NEXT:    ld.w $a0, $a0, 4
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a1, 0
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a0, 1
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a1, 2
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a0, 3
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a1, 4
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a0, 5
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a1, 6
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a0, 7
+; LA32-NEXT:    vinsgr2vr.w $vr0, $a1, 0
+; LA32-NEXT:    vinsgr2vr.w $vr0, $a0, 1
+; LA32-NEXT:    xvreplve0.d $xr0, $xr0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: xvldrepl_d:
@@ -187,14 +172,9 @@ define <4 x i64> @xvldrepl_d_offset(ptr %ptr) {
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    ld.w $a1, $a0, 264
 ; LA32-NEXT:    ld.w $a0, $a0, 268
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a1, 0
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a0, 1
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a1, 2
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a0, 3
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a1, 4
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a0, 5
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a1, 6
-; LA32-NEXT:    xvinsgr2vr.w $xr0, $a0, 7
+; LA32-NEXT:    vinsgr2vr.w $vr0, $a1, 0
+; LA32-NEXT:    vinsgr2vr.w $vr0, $a0, 1
+; LA32-NEXT:    xvreplve0.d $xr0, $xr0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: xvldrepl_d_offset: