@@ -3321,63 +3321,6 @@ defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
 // Pre-fetch.
 defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;
 
-def vec_ins_or_scal_vec : PatFrags<(ops node:$src),
-                                   [(vector_insert undef, node:$src, (i64 0)),
-                                    (scalar_to_vector node:$src)]>;
-
-// For regular load, we do not have any alignment requirement.
-// Thus, it is safe to directly map the vector loads with interesting
-// addressing modes.
-// FIXME: We could do the same for bitconvert to floating point vectors.
-multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
-                              ValueType ScalTy, ValueType VecTy,
-                              Instruction LOADW, Instruction LOADX,
-                              SubRegIndex sub> {
-  def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
-              (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
-            (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
-                           (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
-                           sub)>;
-
-  def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
-              (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
-            (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
-                           (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
-                           sub)>;
-}
-
-let AddedComplexity = 10 in {
-defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v8i8, LDRBroW, LDRBroX, bsub>;
-defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v16i8, LDRBroW, LDRBroX, bsub>;
-
-defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
-defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;
-
-defm : ScalToVecROLoadPat<ro16, load, i32, v4f16, LDRHroW, LDRHroX, hsub>;
-defm : ScalToVecROLoadPat<ro16, load, i32, v8f16, LDRHroW, LDRHroX, hsub>;
-
-defm : ScalToVecROLoadPat<ro32, load, i32, v2i32, LDRSroW, LDRSroX, ssub>;
-defm : ScalToVecROLoadPat<ro32, load, i32, v4i32, LDRSroW, LDRSroX, ssub>;
-
-defm : ScalToVecROLoadPat<ro32, load, f32, v2f32, LDRSroW, LDRSroX, ssub>;
-defm : ScalToVecROLoadPat<ro32, load, f32, v4f32, LDRSroW, LDRSroX, ssub>;
-
-defm : ScalToVecROLoadPat<ro64, load, i64, v2i64, LDRDroW, LDRDroX, dsub>;
-
-defm : ScalToVecROLoadPat<ro64, load, f64, v2f64, LDRDroW, LDRDroX, dsub>;
-
-
-def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
-                      (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
-                                           ro_Wextend64:$extend))))),
-           (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
-
-def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
-                      (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
-                                           ro_Xextend64:$extend))))),
-           (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
-}
-
 // Match all load 64 bits width whose type is compatible with FPR64
 multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy,
                         Instruction LOADW, Instruction LOADX> {
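
To see what the deleted register-offset patterns buy (they are re-added further down through a shared multiclass), here is a hand-written LLVM IR sketch; the function name is invented and this is not taken from the patch or its tests:

```llvm
; With these patterns the load and the lane-0 insert should select to a
; single "ldr s0, [x0, x1, lsl #2]", rather than a GPR load followed by
; an fmov/ins into the vector register.
define <4 x i32> @insert_undef_ro(ptr %base, i64 %idx) {
  %addr = getelementptr inbounds i32, ptr %base, i64 %idx
  %scal = load i32, ptr %addr, align 4
  %vec = insertelement <4 x i32> undef, i32 %scal, i64 0
  ret <4 x i32> %vec
}
```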
@@ -3501,42 +3444,6 @@ defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr",
 def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
            (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
 
-// For regular load, we do not have any alignment requirement.
-// Thus, it is safe to directly map the vector loads with interesting
-// addressing modes.
-// FIXME: We could do the same for bitconvert to floating point vectors.
-def : Pat <(v8i8 (vec_ins_or_scal_vec (i32
-             (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
-           (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
-                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-def : Pat <(v16i8 (vec_ins_or_scal_vec (i32
-             (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
-           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-def : Pat <(v4i16 (vec_ins_or_scal_vec (i32
-             (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
-           (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
-                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-def : Pat <(v8i16 (vec_ins_or_scal_vec (i32
-             (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
-           (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
-                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-def : Pat <(v2i32 (vec_ins_or_scal_vec (i32
-             (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
-           (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
-                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-def : Pat <(v4i32 (vec_ins_or_scal_vec (i32
-             (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
-           (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
-                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
-             (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
-           (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
-def : Pat <(v2i64 (vec_ins_or_scal_vec (i64
-             (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
-           (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
-                          (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
-
 // Match all load 64 bits width whose type is compatible with FPR64
 let Predicates = [IsLE] in {
   // We must use LD1 to perform vector loads in big-endian.
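
The unsigned scaled-immediate (`am_indexed*`) patterns deleted here match the same insert-into-undef shape; they likewise reappear in the unified multiclass below. Another assumed IR sketch with an invented function name:

```llvm
; The anyext i16 load and the lane-0 insert should fold into one
; "ldr h0, [x0, #16]" (LDRHui with a scaled unsigned offset).
define <8 x i16> @insert_undef_ui(ptr %base) {
  %addr = getelementptr inbounds i16, ptr %base, i64 8
  %scal = load i16, ptr %addr, align 2
  %vec = insertelement <8 x i16> undef, i16 %scal, i64 0
  ret <8 x i16> %vec
}
```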
@@ -3902,12 +3809,13 @@ def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
 def : InstAlias<"ldrsw $Rt, [$Rn, $offset]",
                 (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
 
-// A LDR will implicitly zero the rest of the vector, so vector_insert(zeros,
-// load, 0) can use a single load.
-multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType SVT,
-                                  ValueType ScalarVT, Instruction LoadInst, Instruction UnscaledLoadInst,
-                                  ComplexPattern Addr, ComplexPattern UnscaledAddr, Operand AddrImm,
-                                  SubRegIndex SubReg> {
+// A LDR will implicitly zero the rest of the vector, so vector_insert(zeros, load, 0)
+// can use a single load. Same for scalar_to_vector(load) or insert(undef, load, 0).
+multiclass LoadInsertVTPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType ScalarVT,
+                                Instruction LoadInst, Instruction UnscaledLoadInst,
+                                Instruction ROWLoadInst, Instruction ROXLoadInst,
+                                ROAddrMode ro, ComplexPattern Addr, ComplexPattern UnscaledAddr,
+                                Operand AddrImm, SubRegIndex SubReg> {
   // Scaled
   def : Pat <(vector_insert (VT immAllZerosV),
                (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
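
The comment in this hunk leans on an AArch64 fact: a scalar LDR into a SIMD&FP register (`ldr b/h/s/d`) zeroes the remaining bits of the 128-bit register, which is what makes the bare `SUBREG_TO_REG` sound for the zero-vector case. An assumed IR sketch (invented name, not from the patch's tests):

```llvm
; This should select to just "ldr s0, [x0]": no explicit movi or mov is
; needed to zero lanes 1-3, since the scalar load clears bits [127:32] of q0.
define <4 x i32> @insert_zero(ptr %p) {
  %scal = load i32, ptr %p, align 4
  %vec = insertelement <4 x i32> zeroinitializer, i32 %scal, i64 0
  ret <4 x i32> %vec
}
```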
@@ -3916,42 +3824,82 @@ multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueT
   def : Pat <(vector_insert (VT immAllZerosV),
                (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
              (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
+  // roW
+  def : Pat <(vector_insert (VT immAllZerosV),
+               (ScalarVT (LoadOp (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), (i64 0)),
+             (SUBREG_TO_REG (i64 0), (ROWLoadInst GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), SubReg)>;
+  // roX
+  def : Pat <(vector_insert (VT immAllZerosV),
+               (ScalarVT (LoadOp (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), (i64 0)),
+             (SUBREG_TO_REG (i64 0), (ROXLoadInst GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), SubReg)>;
 
-  // Half-vector patterns
-  def : Pat <(vector_insert (HVT immAllZerosV),
-               (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
-             (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
-  // Unscaled
-  def : Pat <(vector_insert (HVT immAllZerosV),
-               (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
-             (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
-
-  // SVE patterns
-  def : Pat <(vector_insert (SVT immAllZerosV),
-               (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
-             (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
-  // Unscaled
-  def : Pat <(vector_insert (SVT immAllZerosV),
-               (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
+  // Undef equivalents of the patterns above.
+  def : Pat <(VT (vec_ins_or_scal_vec
+                   (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))))),
+             (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
+  def : Pat <(VT (vec_ins_or_scal_vec
+                   (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))))),
              (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
-}
-
-defm : LoadInsertZeroPatterns<extloadi8, v16i8, v8i8, nxv16i8, i32, LDRBui, LDURBi,
-                              am_indexed8, am_unscaled8, uimm12s1, bsub>;
-defm : LoadInsertZeroPatterns<extloadi16, v8i16, v4i16, nxv8i16, i32, LDRHui, LDURHi,
-                              am_indexed16, am_unscaled16, uimm12s2, hsub>;
-defm : LoadInsertZeroPatterns<load, v4i32, v2i32, nxv4i32, i32, LDRSui, LDURSi,
-                              am_indexed32, am_unscaled32, uimm12s4, ssub>;
-defm : LoadInsertZeroPatterns<load, v2i64, v1i64, nxv2i64, i64, LDRDui, LDURDi,
-                              am_indexed64, am_unscaled64, uimm12s8, dsub>;
-defm : LoadInsertZeroPatterns<load, v8f16, v4f16, nxv8f16, f16, LDRHui, LDURHi,
-                              am_indexed16, am_unscaled16, uimm12s2, hsub>;
-defm : LoadInsertZeroPatterns<load, v8bf16, v4bf16, nxv8bf16, bf16, LDRHui, LDURHi,
-                              am_indexed16, am_unscaled16, uimm12s2, hsub>;
-defm : LoadInsertZeroPatterns<load, v4f32, v2f32, nxv4f32, f32, LDRSui, LDURSi,
-                              am_indexed32, am_unscaled32, uimm12s4, ssub>;
-defm : LoadInsertZeroPatterns<load, v2f64, v1f64, nxv2f64, f64, LDRDui, LDURDi,
-                              am_indexed64, am_unscaled64, uimm12s8, dsub>;
+  def : Pat <(VT (vec_ins_or_scal_vec
+                   (ScalarVT (LoadOp (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))))),
+             (SUBREG_TO_REG (i64 0), (ROWLoadInst GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), SubReg)>;
+  def : Pat <(VT (vec_ins_or_scal_vec
+                   (ScalarVT (LoadOp (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))))),
+             (SUBREG_TO_REG (i64 0), (ROXLoadInst GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), SubReg)>;
+}
+
+multiclass LoadInsertPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType SVT,
+                              ValueType ScalarVT, Instruction LoadInst, Instruction UnscaledLoadInst,
+                              Instruction ROWLoadInst, Instruction ROXLoadInst,
+                              ROAddrMode ro, ComplexPattern Addr, ComplexPattern UnscaledAddr,
+                              Operand AddrImm, SubRegIndex SubReg> {
+  defm : LoadInsertVTPatterns<LoadOp, VT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+                              ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+  defm : LoadInsertVTPatterns<LoadOp, HVT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+                              ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+  defm : LoadInsertVTPatterns<LoadOp, SVT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+                              ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+}
+
+defm : LoadInsertPatterns<extloadi8, v16i8, v8i8, nxv16i8, i32,
+                          LDRBui, LDURBi, LDRBroW, LDRBroX,
+                          ro8, am_indexed8, am_unscaled8, uimm12s1, bsub>;
+defm : LoadInsertPatterns<extloadi16, v8i16, v4i16, nxv8i16, i32,
+                          LDRHui, LDURHi, LDRHroW, LDRHroX,
+                          ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+defm : LoadInsertPatterns<load, v4i32, v2i32, nxv4i32, i32,
+                          LDRSui, LDURSi, LDRSroW, LDRSroX,
+                          ro32, am_indexed32, am_unscaled32, uimm12s4, ssub>;
+defm : LoadInsertPatterns<load, v2i64, isVoid, nxv2i64, i64,
+                          LDRDui, LDURDi, LDRDroW, LDRDroX,
+                          ro64, am_indexed64, am_unscaled64, uimm12s8, dsub>;
+defm : LoadInsertPatterns<load, v8f16, v4f16, nxv8f16, f16,
+                          LDRHui, LDURHi, LDRHroW, LDRHroX,
+                          ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+defm : LoadInsertPatterns<load, v8bf16, v4bf16, nxv8bf16, bf16,
+                          LDRHui, LDURHi, LDRHroW, LDRHroX,
+                          ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+defm : LoadInsertPatterns<load, v4f32, v2f32, nxv4f32, f32,
+                          LDRSui, LDURSi, LDRSroW, LDRSroX,
+                          ro32, am_indexed32, am_unscaled32, uimm12s4, ssub>;
+defm : LoadInsertPatterns<load, v2f64, isVoid, nxv2f64, f64,
+                          LDRDui, LDURDi, LDRDroW, LDRDroX,
+                          ro64, am_indexed64, am_unscaled64, uimm12s8, dsub>;
+
+// Extra patterns for v1i64 scalar_to_vector(load), which need to avoid the
+// SUBREG_TO_REG used above.
+def : Pat <(v1i64 (scalar_to_vector (i64
+                     (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+           (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+                     (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))))),
+           (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+                     (load (ro64.Wpat GPR64sp:$Rn, GPR32:$Rm, ro64.Wext:$extend))))),
+           (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro64.Wext:$extend)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+                     (load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),
+           (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>;
 
 // Pre-fetch.
 defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
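
For the v1i64 patterns above: a <1 x i64> value already fills a whole D register, so there is no enclosing register for SUBREG_TO_REG to build and the load result is used directly. One more assumed IR sketch (invented name, not from the patch):

```llvm
; This should select to a single "ldr d0, [x0, x1, lsl #3]" with no
; separate subregister insertion.
define <1 x i64> @v1i64_ro(ptr %base, i64 %idx) {
  %addr = getelementptr inbounds i64, ptr %base, i64 %idx
  %scal = load i64, ptr %addr, align 8
  %vec = insertelement <1 x i64> undef, i64 %scal, i64 0
  ret <1 x i64> %vec
}
```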