21 changes: 18 additions & 3 deletions clang/include/clang/Basic/arm_mve_defs.td
@@ -74,9 +74,9 @@ def immshr: CGHelperFn<"MVEImmediateShr"> {
let special_params = [IRBuilderIntParam<1, "unsigned">,
IRBuilderIntParam<2, "bool">];
}
def fadd: IRBuilder<"CreateFAdd">;
def fmul: IRBuilder<"CreateFMul">;
def fsub: IRBuilder<"CreateFSub">;
def fadd_node: IRBuilder<"CreateFAdd">;
def fmul_node: IRBuilder<"CreateFMul">;
def fsub_node: IRBuilder<"CreateFSub">;
def load: IRBuilder<"CreateLoad"> {
let special_params = [IRBuilderAddrParam<0>];
}
@@ -212,6 +212,13 @@ def unsignedflag;
// constant giving its size in bits.
def bitsize;

// strictFPAlt allows a node to have different code generation under strict-fp.
// TODO: allow the standard node to be either an IRBuilderBase or an IRIntBase.
class strictFPAlt<IRBuilderBase standard_, IRIntBase strictfp_> {
IRBuilderBase standard = standard_;
IRIntBase strictfp = strictfp_;
}

// If you put CustomCodegen<"foo"> in an intrinsic's codegen field, it
// indicates that the IR generation for that intrinsic is done by handwritten
// C++ and not autogenerated at all. The effect in the MVE builtin codegen
@@ -573,6 +580,14 @@ multiclass IntrinsicMXNameOverride<Type rettype, dag arguments, dag cg,
}
}

// strictFPAlt wrappers that choose between the standard fadd/fsub/fmul IR
// nodes and the llvm.arm.mve.vadd/vsub/vmul intrinsics, depending on whether
// we are compiling with strict-fp.
def fadd: strictFPAlt<fadd_node,
IRInt<"vadd", [Vector]>>;
def fsub: strictFPAlt<fsub_node,
IRInt<"vsub", [Vector]>>;
def fmul: strictFPAlt<fmul_node,
IRInt<"vmul", [Vector]>>;

// -----------------------------------------------------------------------------
// Convenience lists of parameter types. 'T' is just a container record, so you
214 changes: 146 additions & 68 deletions clang/test/CodeGen/arm-mve-intrinsics/vaddq.c

Large diffs are not rendered by default.

390 changes: 266 additions & 124 deletions clang/test/CodeGen/arm-mve-intrinsics/vmulq.c

Large diffs are not rendered by default.

214 changes: 146 additions & 68 deletions clang/test/CodeGen/arm-mve-intrinsics/vsubq.c

Large diffs are not rendered by default.

36 changes: 34 additions & 2 deletions clang/utils/TableGen/MveEmitter.cpp
@@ -757,6 +757,26 @@ class IRIntrinsicResult : public Result {
}
};

// Result subclass that generates
// !Builder.getIsFPConstrained() ? <Standard> : <StrictFp>
class StrictFpAltResult : public Result {
public:
Ptr Standard;
Ptr StrictFp;
StrictFpAltResult(Ptr Standard, Ptr StrictFp)
: Standard(Standard), StrictFp(StrictFp) {}
void genCode(raw_ostream &OS,
CodeGenParamAllocator &ParamAlloc) const override {
OS << "!Builder.getIsFPConstrained() ? ";
Standard->genCode(OS, ParamAlloc);
OS << " : ";
StrictFp->genCode(OS, ParamAlloc);
}
void morePrerequisites(std::vector<Ptr> &output) const override {
Standard->morePrerequisites(output);
}
};

// Result subclass that specifies a type, for use in IRBuilder operations such
// as CreateBitCast that take a type argument.
class TypeResult : public Result {
@@ -1239,7 +1259,8 @@ Result::Ptr EmitterBase::getCodeForDag(const DagInit *D,
std::vector<Result::Ptr> Args;
for (unsigned i = 0, e = D->getNumArgs(); i < e; ++i)
Args.push_back(getCodeForDagArg(D, i, Scope, Param));
if (Op->isSubClassOf("IRBuilderBase")) {

auto GenIRBuilderBase = [&](const Record *Op) {
std::set<unsigned> AddressArgs;
std::map<unsigned, std::string> IntegerArgs;
for (const Record *sp : Op->getValueAsListOfDefs("special_params")) {
@@ -1252,14 +1273,25 @@ Result::Ptr EmitterBase::getCodeForDag(const DagInit *D,
}
return std::make_shared<IRBuilderResult>(Op->getValueAsString("prefix"),
Args, AddressArgs, IntegerArgs);
} else if (Op->isSubClassOf("IRIntBase")) {
};
auto GenIRIntBase = [&](const Record *Op) {
std::vector<const Type *> ParamTypes;
for (const Record *RParam : Op->getValueAsListOfDefs("params"))
ParamTypes.push_back(getType(RParam, Param));
std::string IntName = std::string(Op->getValueAsString("intname"));
if (Op->getValueAsBit("appendKind"))
IntName += "_" + toLetter(cast<ScalarType>(Param)->kind());
return std::make_shared<IRIntrinsicResult>(IntName, ParamTypes, Args);
};

if (Op->isSubClassOf("IRBuilderBase")) {
return GenIRBuilderBase(Op);
} else if (Op->isSubClassOf("IRIntBase")) {
return GenIRIntBase(Op);
} else if (Op->isSubClassOf("strictFPAlt")) {
auto Standard = GenIRBuilderBase(Op->getValueAsDef("standard"));
auto StrictFp = GenIRIntBase(Op->getValueAsDef("strictfp"));
return std::make_shared<StrictFpAltResult>(Standard, StrictFp);
} else {
PrintFatalError("Unsupported dag node " + Op->getName());
}
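Taken together, the strictFPAlt record and the emitter hooks above make each generated MVE builtin choose between the plain IR node and the target intrinsic based on Builder.getIsFPConstrained(). A minimal C++ sketch of that shape for a float vaddq — illustrative only, not the literal MveEmitter output; the helper name and argument plumbing here are assumptions, while Intrinsic::arm_mve_vadd is the intrinsic added in this patch:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsARM.h"

using namespace llvm;

// Non-strict code keeps the generic fadd, so the usual FP folds and ISel
// patterns still apply; strict-fp code calls llvm.arm.mve.vadd, which the new
// PatFrags in ARMInstrMVE.td select to the same VADD instruction.
static Value *emitVaddqFloat(IRBuilder<> &Builder, Value *A, Value *B) {
  return !Builder.getIsFPConstrained()
             ? Builder.CreateFAdd(A, B)
             : Builder.CreateIntrinsic(Intrinsic::arm_mve_vadd,
                                       {A->getType()}, {A, B});
}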
12 changes: 12 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsARM.td
@@ -1138,6 +1138,18 @@ def int_arm_mve_vshlc_predicated: DefaultAttrsIntrinsic<
[llvm_i32_ty /* bits shifted out */, llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_i32_ty /* bits shifted in */,
llvm_i32_ty /* shift count */, llvm_anyvector_ty], [IntrNoMem]>;
def int_arm_mve_vadd: DefaultAttrsIntrinsic<
[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
def int_arm_mve_vsub: DefaultAttrsIntrinsic<
[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
def int_arm_mve_vmul: DefaultAttrsIntrinsic<
[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
def int_arm_mve_vmulh: DefaultAttrsIntrinsic<
[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */],
33 changes: 22 additions & 11 deletions llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -384,6 +384,16 @@ multiclass MVE_TwoOpPatternDup<MVEVectorVTInfo VTI, SDPatternOperator Op, Intrin
(VTI.Vec MQPR:$inactive)))>;
}

def vadd : PatFrags<(ops node:$lhs, node:$rhs),
[(fadd node:$lhs, node:$rhs),
(int_arm_mve_vadd node:$lhs, node:$rhs)]>;
def vsub : PatFrags<(ops node:$lhs, node:$rhs),
[(fsub node:$lhs, node:$rhs),
(int_arm_mve_vsub node:$lhs, node:$rhs)]>;
def vmul : PatFrags<(ops node:$lhs, node:$rhs),
[(fmul node:$lhs, node:$rhs),
(int_arm_mve_vmul node:$lhs, node:$rhs)]>;

// --------- Start of base classes for the instructions themselves

class MVE_MI<dag oops, dag iops, InstrItinClass itin, string asm,
@@ -3605,7 +3615,7 @@ class MVE_VMUL_fp<string iname, string suffix, bits<2> size, list<dag> pattern=[
let validForTailPredication = 1;
}

multiclass MVE_VMULT_fp_m<string iname, MVEVectorVTInfo VTI, SDNode Op,
multiclass MVE_VMULT_fp_m<string iname, MVEVectorVTInfo VTI, SDPatternOperator Op,
Intrinsic PredInt, SDPatternOperator IdentityVec> {
def "" : MVE_VMUL_fp<iname, VTI.Suffix, VTI.Size>;
defvar Inst = !cast<Instruction>(NAME);
@@ -3616,7 +3626,7 @@ multiclass MVE_VMULT_fp_m<string iname, MVEVectorVTInfo VTI, SDNode Op,
}

multiclass MVE_VMUL_fp_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec>
: MVE_VMULT_fp_m<"vmul", VTI, fmul, int_arm_mve_mul_predicated, IdentityVec>;
: MVE_VMULT_fp_m<"vmul", VTI, vmul, int_arm_mve_mul_predicated, IdentityVec>;

def ARMimmOneF: PatLeaf<(bitconvert (v4f32 (ARMvmovFPImm (i32 112))))>; // 1.0 float
def ARMimmOneH: PatLeaf<(bitconvert (v8i16 (ARMvmovImm (i32 2620))))>; // 1.0 half
@@ -3740,7 +3750,7 @@ defm MVE_VFMSf32 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v4f32>;
defm MVE_VFMSf16 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v8f16>;

multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
SDNode Op, Intrinsic PredInt, SDPatternOperator IdentityVec> {
SDPatternOperator Op, Intrinsic PredInt, SDPatternOperator IdentityVec> {
def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size, 0, 1, bit_21> {
let validForTailPredication = 1;
}
@@ -3752,9 +3762,9 @@ multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
}

multiclass MVE_VADD_fp_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec>
: MVE_VADDSUB_fp_m<"vadd", 0, VTI, fadd, int_arm_mve_add_predicated, IdentityVec>;
: MVE_VADDSUB_fp_m<"vadd", 0, VTI, vadd, int_arm_mve_add_predicated, IdentityVec>;
multiclass MVE_VSUB_fp_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec>
: MVE_VADDSUB_fp_m<"vsub", 1, VTI, fsub, int_arm_mve_sub_predicated, IdentityVec>;
: MVE_VADDSUB_fp_m<"vsub", 1, VTI, vsub, int_arm_mve_sub_predicated, IdentityVec>;

def ARMimmMinusZeroF: PatLeaf<(bitconvert (v4i32 (ARMvmovImm (i32 1664))))>; // -0.0 float
def ARMimmMinusZeroH: PatLeaf<(bitconvert (v8i16 (ARMvmovImm (i32 2688))))>; // -0.0 half
@@ -5391,21 +5401,22 @@ defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m<MVE_v8u16, subnuw, ARMvshruImm>;
defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32, subnuw, ARMvshruImm>;

multiclass MVE_VADDSUB_qr_f<string iname, MVEVectorVTInfo VTI, bit subtract,
SDNode Op, Intrinsic PredInt, SDPatternOperator IdentityVec> {
SDPatternOperator Op, Intrinsic PredInt,
SDPatternOperator IdentityVec> {
def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, subtract, VTI.Size>;
defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? ),
!cast<Instruction>(NAME), IdentityVec>;
}

let Predicates = [HasMVEFloat] in {
defm MVE_VADD_qr_f32 : MVE_VADDSUB_qr_f<"vadd", MVE_v4f32, 0b0, fadd,
defm MVE_VADD_qr_f32 : MVE_VADDSUB_qr_f<"vadd", MVE_v4f32, 0b0, vadd,
int_arm_mve_add_predicated, ARMimmMinusZeroF>;
defm MVE_VADD_qr_f16 : MVE_VADDSUB_qr_f<"vadd", MVE_v8f16, 0b0, fadd,
defm MVE_VADD_qr_f16 : MVE_VADDSUB_qr_f<"vadd", MVE_v8f16, 0b0, vadd,
int_arm_mve_add_predicated, ARMimmMinusZeroH>;

defm MVE_VSUB_qr_f32 : MVE_VADDSUB_qr_f<"vsub", MVE_v4f32, 0b1, fsub,
defm MVE_VSUB_qr_f32 : MVE_VADDSUB_qr_f<"vsub", MVE_v4f32, 0b1, vsub,
int_arm_mve_sub_predicated, ARMimmAllZerosV>;
defm MVE_VSUB_qr_f16 : MVE_VADDSUB_qr_f<"vsub", MVE_v8f16, 0b1, fsub,
defm MVE_VSUB_qr_f16 : MVE_VADDSUB_qr_f<"vsub", MVE_v8f16, 0b1, vsub,
int_arm_mve_sub_predicated, ARMimmAllZerosV>;
}

@@ -5588,7 +5599,7 @@ defm MVE_VQRDMULH_qr_s32 : MVE_VQRDMULH_qr_m<MVE_v4s32>;
multiclass MVE_VxxMUL_qr_f_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec> {
let validForTailPredication = 1 in
def "" : MVE_VxxMUL_qr<"vmul", VTI.Suffix, VTI.Size{0}, 0b11, VTI.Size>;
defm : MVE_TwoOpPatternDup<VTI, fmul, int_arm_mve_mul_predicated, (? ),
defm : MVE_TwoOpPatternDup<VTI, vmul, int_arm_mve_mul_predicated, (? ),
!cast<Instruction>(NAME), IdentityVec>;
}

143 changes: 143 additions & 0 deletions llvm/test/CodeGen/Thumb2/mve-intrinsics/strict-intrinsics.ll
@@ -0,0 +1,143 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -o - %s | FileCheck %s

define arm_aapcs_vfpcc <8 x half> @test_vaddq_f16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: test_vaddq_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = tail call <8 x half> @llvm.arm.mve.vadd.v8f16(<8 x half> %a, <8 x half> %b)
ret <8 x half> %0
}

define arm_aapcs_vfpcc <4 x float> @test_vaddq_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_vaddq_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vadd.f32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = tail call <4 x float> @llvm.arm.mve.vadd.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %0
}

define arm_aapcs_vfpcc <8 x half> @test_vsubq_f16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: test_vsubq_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vsub.f16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = tail call <8 x half> @llvm.arm.mve.vsub.v8f16(<8 x half> %a, <8 x half> %b)
ret <8 x half> %0
}

define arm_aapcs_vfpcc <4 x float> @test_vsubq_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_vsubq_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vsub.f32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = tail call <4 x float> @llvm.arm.mve.vsub.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %0
}

define arm_aapcs_vfpcc <8 x half> @test_vmulq_f16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: test_vmulq_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.f16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = tail call <8 x half> @llvm.arm.mve.vmul.v8f16(<8 x half> %a, <8 x half> %b)
ret <8 x half> %0
}

define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_vmulq_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.f32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = tail call <4 x float> @llvm.arm.mve.vmul.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %0
}




define arm_aapcs_vfpcc <8 x half> @test_vaddq_f16_splat(<8 x half> %a, half %b) {
; CHECK-LABEL: test_vaddq_f16_splat:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.f16 r0, s4
; CHECK-NEXT: vadd.f16 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%i = insertelement <8 x half> poison, half %b, i32 0
%s = shufflevector <8 x half> %i, <8 x half> poison, <8 x i32> zeroinitializer
%0 = tail call <8 x half> @llvm.arm.mve.vadd.v8f16(<8 x half> %a, <8 x half> %s)
ret <8 x half> %0
}

define arm_aapcs_vfpcc <4 x float> @test_vaddq_f32_splat(<4 x float> %a, float %b) {
; CHECK-LABEL: test_vaddq_f32_splat:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vadd.f32 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%i = insertelement <4 x float> poison, float %b, i32 0
%s = shufflevector <4 x float> %i, <4 x float> poison, <4 x i32> zeroinitializer
%0 = tail call <4 x float> @llvm.arm.mve.vadd.v4f32(<4 x float> %a, <4 x float> %s)
ret <4 x float> %0
}

define arm_aapcs_vfpcc <8 x half> @test_vsubq_f16_splat(<8 x half> %a, half %b) {
; CHECK-LABEL: test_vsubq_f16_splat:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.f16 r0, s4
; CHECK-NEXT: vsub.f16 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%i = insertelement <8 x half> poison, half %b, i32 0
%s = shufflevector <8 x half> %i, <8 x half> poison, <8 x i32> zeroinitializer
%0 = tail call <8 x half> @llvm.arm.mve.vsub.v8f16(<8 x half> %a, <8 x half> %s)
ret <8 x half> %0
}

define arm_aapcs_vfpcc <4 x float> @test_vsubq_f32_splat(<4 x float> %a, float %b) {
; CHECK-LABEL: test_vsubq_f32_splat:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vsub.f32 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%i = insertelement <4 x float> poison, float %b, i32 0
%s = shufflevector <4 x float> %i, <4 x float> poison, <4 x i32> zeroinitializer
%0 = tail call <4 x float> @llvm.arm.mve.vsub.v4f32(<4 x float> %a, <4 x float> %s)
ret <4 x float> %0
}

define arm_aapcs_vfpcc <8 x half> @test_vmulq_f16_splat(<8 x half> %a, half %b) {
; CHECK-LABEL: test_vmulq_f16_splat:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.f16 r0, s4
; CHECK-NEXT: vmul.f16 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%i = insertelement <8 x half> poison, half %b, i32 0
%s = shufflevector <8 x half> %i, <8 x half> poison, <8 x i32> zeroinitializer
%0 = tail call <8 x half> @llvm.arm.mve.vmul.v8f16(<8 x half> %a, <8 x half> %s)
ret <8 x half> %0
}

define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32_splat(<4 x float> %a, float %b) {
; CHECK-LABEL: test_vmulq_f32_splat:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmul.f32 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%i = insertelement <4 x float> poison, float %b, i32 0
%s = shufflevector <4 x float> %i, <4 x float> poison, <4 x i32> zeroinitializer
%0 = tail call <4 x float> @llvm.arm.mve.vmul.v4f32(<4 x float> %a, <4 x float> %s)
ret <4 x float> %0
}
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll
@@ -25,8 +25,8 @@ entry:

declare <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32>, <4 x i32>, i32) #1

define arm_aapcs_vfpcc <8 x half> @test_vabdq_f32(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vabdq_f32:
define arm_aapcs_vfpcc <8 x half> @test_vabdq_f16(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vabdq_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vabd.f16 q0, q0, q1
; CHECK-NEXT: bx lr