Skip to content

Commit d929146

Browse files
[Clang][AArch64] Lower NEON vaddv/vminv/vmaxv builtins to llvm.vector.reduce intrinsics. (llvm#165400)
This is the first step in removing some NEON reduction intrinsics that duplicate the behaviour of their llvm.vector.reduce counterparts. NOTE: The i8/i16 variants differ in that the NEON versions return an i32 result. However, this looks more about making their code generation convenient, with SelectionDAG discarding the extra bits. This is only relevant for the next phase because the Clang usage always truncates their result, making llvm.vector.reduce a drop-in replacement.
1 parent 838f643 commit d929146

File tree

3 files changed

+104
-297
lines changed

3 files changed

+104
-297
lines changed

clang/lib/CodeGen/TargetBuiltins/ARM.cpp

Lines changed: 40 additions & 209 deletions
Original file line numberDiff line numberDiff line change
@@ -1193,14 +1193,22 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
11931193
NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
11941194
NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
11951195
NEONMAP1(vaddv_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
1196-
NEONMAP1(vaddv_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
1197-
NEONMAP1(vaddv_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
1196+
NEONMAP1(vaddv_s16, vector_reduce_add, Add1ArgType),
1197+
NEONMAP1(vaddv_s32, vector_reduce_add, Add1ArgType),
1198+
NEONMAP1(vaddv_s8, vector_reduce_add, Add1ArgType),
1199+
NEONMAP1(vaddv_u16, vector_reduce_add, Add1ArgType),
1200+
NEONMAP1(vaddv_u32, vector_reduce_add, Add1ArgType),
1201+
NEONMAP1(vaddv_u8, vector_reduce_add, Add1ArgType),
11981202
NEONMAP1(vaddvq_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
11991203
NEONMAP1(vaddvq_f64, aarch64_neon_faddv, AddRetType | Add1ArgType),
1200-
NEONMAP1(vaddvq_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
1201-
NEONMAP1(vaddvq_s64, aarch64_neon_saddv, AddRetType | Add1ArgType),
1202-
NEONMAP1(vaddvq_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
1203-
NEONMAP1(vaddvq_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
1204+
NEONMAP1(vaddvq_s16, vector_reduce_add, Add1ArgType),
1205+
NEONMAP1(vaddvq_s32, vector_reduce_add, Add1ArgType),
1206+
NEONMAP1(vaddvq_s64, vector_reduce_add, Add1ArgType),
1207+
NEONMAP1(vaddvq_s8, vector_reduce_add, Add1ArgType),
1208+
NEONMAP1(vaddvq_u16, vector_reduce_add, Add1ArgType),
1209+
NEONMAP1(vaddvq_u32, vector_reduce_add, Add1ArgType),
1210+
NEONMAP1(vaddvq_u64, vector_reduce_add, Add1ArgType),
1211+
NEONMAP1(vaddvq_u8, vector_reduce_add, Add1ArgType),
12041212
NEONMAP1(vcaged_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
12051213
NEONMAP1(vcages_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
12061214
NEONMAP1(vcagtd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
@@ -1243,27 +1251,43 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
12431251
NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
12441252
NEONMAP1(vmaxnmvq_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
12451253
NEONMAP1(vmaxv_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
1246-
NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
1247-
NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
1254+
NEONMAP1(vmaxv_s16, vector_reduce_smax, Add1ArgType),
1255+
NEONMAP1(vmaxv_s32, vector_reduce_smax, Add1ArgType),
1256+
NEONMAP1(vmaxv_s8, vector_reduce_smax, Add1ArgType),
1257+
NEONMAP1(vmaxv_u16, vector_reduce_umax, Add1ArgType),
1258+
NEONMAP1(vmaxv_u32, vector_reduce_umax, Add1ArgType),
1259+
NEONMAP1(vmaxv_u8, vector_reduce_umax, Add1ArgType),
12481260
NEONMAP1(vmaxvq_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
12491261
NEONMAP1(vmaxvq_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
1250-
NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
1251-
NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
1262+
NEONMAP1(vmaxvq_s16, vector_reduce_smax, Add1ArgType),
1263+
NEONMAP1(vmaxvq_s32, vector_reduce_smax, Add1ArgType),
1264+
NEONMAP1(vmaxvq_s8, vector_reduce_smax, Add1ArgType),
1265+
NEONMAP1(vmaxvq_u16, vector_reduce_umax, Add1ArgType),
1266+
NEONMAP1(vmaxvq_u32, vector_reduce_umax, Add1ArgType),
1267+
NEONMAP1(vmaxvq_u8, vector_reduce_umax, Add1ArgType),
12521268
NEONMAP1(vminnmv_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
12531269
NEONMAP1(vminnmvq_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
12541270
NEONMAP1(vminnmvq_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
12551271
NEONMAP1(vminv_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
1256-
NEONMAP1(vminv_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
1257-
NEONMAP1(vminv_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
1272+
NEONMAP1(vminv_s16, vector_reduce_smin, Add1ArgType),
1273+
NEONMAP1(vminv_s32, vector_reduce_smin, Add1ArgType),
1274+
NEONMAP1(vminv_s8, vector_reduce_smin, Add1ArgType),
1275+
NEONMAP1(vminv_u16, vector_reduce_umin, Add1ArgType),
1276+
NEONMAP1(vminv_u32, vector_reduce_umin, Add1ArgType),
1277+
NEONMAP1(vminv_u8, vector_reduce_umin, Add1ArgType),
12581278
NEONMAP1(vminvq_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
12591279
NEONMAP1(vminvq_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
1260-
NEONMAP1(vminvq_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
1261-
NEONMAP1(vminvq_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
1280+
NEONMAP1(vminvq_s16, vector_reduce_smin, Add1ArgType),
1281+
NEONMAP1(vminvq_s32, vector_reduce_smin, Add1ArgType),
1282+
NEONMAP1(vminvq_s8, vector_reduce_smin, Add1ArgType),
1283+
NEONMAP1(vminvq_u16, vector_reduce_umin, Add1ArgType),
1284+
NEONMAP1(vminvq_u32, vector_reduce_umin, Add1ArgType),
1285+
NEONMAP1(vminvq_u8, vector_reduce_umin, Add1ArgType),
12621286
NEONMAP1(vmull_p64, aarch64_neon_pmull64, 0),
12631287
NEONMAP1(vmulxd_f64, aarch64_neon_fmulx, Add1ArgType),
12641288
NEONMAP1(vmulxs_f32, aarch64_neon_fmulx, Add1ArgType),
1265-
NEONMAP1(vpaddd_s64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
1266-
NEONMAP1(vpaddd_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
1289+
NEONMAP1(vpaddd_s64, vector_reduce_add, Add1ArgType),
1290+
NEONMAP1(vpaddd_u64, vector_reduce_add, Add1ArgType),
12671291
NEONMAP1(vpmaxnmqd_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
12681292
NEONMAP1(vpmaxnms_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
12691293
NEONMAP1(vpmaxqd_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
@@ -7067,127 +7091,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
70677091
Int = Intrinsic::bitreverse;
70687092
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit");
70697093
}
7070-
case NEON::BI__builtin_neon_vaddv_u8:
7071-
// FIXME: These are handled by the AArch64 scalar code.
7072-
usgn = true;
7073-
[[fallthrough]];
7074-
case NEON::BI__builtin_neon_vaddv_s8: {
7075-
Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
7076-
Ty = Int32Ty;
7077-
VTy = llvm::FixedVectorType::get(Int8Ty, 8);
7078-
llvm::Type *Tys[2] = { Ty, VTy };
7079-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7080-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
7081-
return Builder.CreateTrunc(Ops[0], Int8Ty);
7082-
}
7083-
case NEON::BI__builtin_neon_vaddv_u16:
7084-
usgn = true;
7085-
[[fallthrough]];
7086-
case NEON::BI__builtin_neon_vaddv_s16: {
7087-
Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
7088-
Ty = Int32Ty;
7089-
VTy = llvm::FixedVectorType::get(Int16Ty, 4);
7090-
llvm::Type *Tys[2] = { Ty, VTy };
7091-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7092-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
7093-
return Builder.CreateTrunc(Ops[0], Int16Ty);
7094-
}
7095-
case NEON::BI__builtin_neon_vaddvq_u8:
7096-
usgn = true;
7097-
[[fallthrough]];
7098-
case NEON::BI__builtin_neon_vaddvq_s8: {
7099-
Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
7100-
Ty = Int32Ty;
7101-
VTy = llvm::FixedVectorType::get(Int8Ty, 16);
7102-
llvm::Type *Tys[2] = { Ty, VTy };
7103-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7104-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
7105-
return Builder.CreateTrunc(Ops[0], Int8Ty);
7106-
}
7107-
case NEON::BI__builtin_neon_vaddvq_u16:
7108-
usgn = true;
7109-
[[fallthrough]];
7110-
case NEON::BI__builtin_neon_vaddvq_s16: {
7111-
Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
7112-
Ty = Int32Ty;
7113-
VTy = llvm::FixedVectorType::get(Int16Ty, 8);
7114-
llvm::Type *Tys[2] = { Ty, VTy };
7115-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7116-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
7117-
return Builder.CreateTrunc(Ops[0], Int16Ty);
7118-
}
7119-
case NEON::BI__builtin_neon_vmaxv_u8: {
7120-
Int = Intrinsic::aarch64_neon_umaxv;
7121-
Ty = Int32Ty;
7122-
VTy = llvm::FixedVectorType::get(Int8Ty, 8);
7123-
llvm::Type *Tys[2] = { Ty, VTy };
7124-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7125-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7126-
return Builder.CreateTrunc(Ops[0], Int8Ty);
7127-
}
7128-
case NEON::BI__builtin_neon_vmaxv_u16: {
7129-
Int = Intrinsic::aarch64_neon_umaxv;
7130-
Ty = Int32Ty;
7131-
VTy = llvm::FixedVectorType::get(Int16Ty, 4);
7132-
llvm::Type *Tys[2] = { Ty, VTy };
7133-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7134-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7135-
return Builder.CreateTrunc(Ops[0], Int16Ty);
7136-
}
7137-
case NEON::BI__builtin_neon_vmaxvq_u8: {
7138-
Int = Intrinsic::aarch64_neon_umaxv;
7139-
Ty = Int32Ty;
7140-
VTy = llvm::FixedVectorType::get(Int8Ty, 16);
7141-
llvm::Type *Tys[2] = { Ty, VTy };
7142-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7143-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7144-
return Builder.CreateTrunc(Ops[0], Int8Ty);
7145-
}
7146-
case NEON::BI__builtin_neon_vmaxvq_u16: {
7147-
Int = Intrinsic::aarch64_neon_umaxv;
7148-
Ty = Int32Ty;
7149-
VTy = llvm::FixedVectorType::get(Int16Ty, 8);
7150-
llvm::Type *Tys[2] = { Ty, VTy };
7151-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7152-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7153-
return Builder.CreateTrunc(Ops[0], Int16Ty);
7154-
}
7155-
case NEON::BI__builtin_neon_vmaxv_s8: {
7156-
Int = Intrinsic::aarch64_neon_smaxv;
7157-
Ty = Int32Ty;
7158-
VTy = llvm::FixedVectorType::get(Int8Ty, 8);
7159-
llvm::Type *Tys[2] = { Ty, VTy };
7160-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7161-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7162-
return Builder.CreateTrunc(Ops[0], Int8Ty);
7163-
}
7164-
case NEON::BI__builtin_neon_vmaxv_s16: {
7165-
Int = Intrinsic::aarch64_neon_smaxv;
7166-
Ty = Int32Ty;
7167-
VTy = llvm::FixedVectorType::get(Int16Ty, 4);
7168-
llvm::Type *Tys[2] = { Ty, VTy };
7169-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7170-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7171-
return Builder.CreateTrunc(Ops[0], Int16Ty);
7172-
}
7173-
case NEON::BI__builtin_neon_vmaxvq_s8: {
7174-
Int = Intrinsic::aarch64_neon_smaxv;
7175-
Ty = Int32Ty;
7176-
VTy = llvm::FixedVectorType::get(Int8Ty, 16);
7177-
llvm::Type *Tys[2] = { Ty, VTy };
7178-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7179-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7180-
return Builder.CreateTrunc(Ops[0], Int8Ty);
7181-
}
7182-
case NEON::BI__builtin_neon_vmaxvq_s16: {
7183-
Int = Intrinsic::aarch64_neon_smaxv;
7184-
Ty = Int32Ty;
7185-
VTy = llvm::FixedVectorType::get(Int16Ty, 8);
7186-
llvm::Type *Tys[2] = { Ty, VTy };
7187-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7188-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
7189-
return Builder.CreateTrunc(Ops[0], Int16Ty);
7190-
}
71917094
case NEON::BI__builtin_neon_vmaxv_f16: {
71927095
Int = Intrinsic::aarch64_neon_fmaxv;
71937096
Ty = HalfTy;
@@ -7206,78 +7109,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
72067109
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
72077110
return Builder.CreateTrunc(Ops[0], HalfTy);
72087111
}
7209-
case NEON::BI__builtin_neon_vminv_u8: {
7210-
Int = Intrinsic::aarch64_neon_uminv;
7211-
Ty = Int32Ty;
7212-
VTy = llvm::FixedVectorType::get(Int8Ty, 8);
7213-
llvm::Type *Tys[2] = { Ty, VTy };
7214-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7215-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7216-
return Builder.CreateTrunc(Ops[0], Int8Ty);
7217-
}
7218-
case NEON::BI__builtin_neon_vminv_u16: {
7219-
Int = Intrinsic::aarch64_neon_uminv;
7220-
Ty = Int32Ty;
7221-
VTy = llvm::FixedVectorType::get(Int16Ty, 4);
7222-
llvm::Type *Tys[2] = { Ty, VTy };
7223-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7224-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7225-
return Builder.CreateTrunc(Ops[0], Int16Ty);
7226-
}
7227-
case NEON::BI__builtin_neon_vminvq_u8: {
7228-
Int = Intrinsic::aarch64_neon_uminv;
7229-
Ty = Int32Ty;
7230-
VTy = llvm::FixedVectorType::get(Int8Ty, 16);
7231-
llvm::Type *Tys[2] = { Ty, VTy };
7232-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7233-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7234-
return Builder.CreateTrunc(Ops[0], Int8Ty);
7235-
}
7236-
case NEON::BI__builtin_neon_vminvq_u16: {
7237-
Int = Intrinsic::aarch64_neon_uminv;
7238-
Ty = Int32Ty;
7239-
VTy = llvm::FixedVectorType::get(Int16Ty, 8);
7240-
llvm::Type *Tys[2] = { Ty, VTy };
7241-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7242-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7243-
return Builder.CreateTrunc(Ops[0], Int16Ty);
7244-
}
7245-
case NEON::BI__builtin_neon_vminv_s8: {
7246-
Int = Intrinsic::aarch64_neon_sminv;
7247-
Ty = Int32Ty;
7248-
VTy = llvm::FixedVectorType::get(Int8Ty, 8);
7249-
llvm::Type *Tys[2] = { Ty, VTy };
7250-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7251-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7252-
return Builder.CreateTrunc(Ops[0], Int8Ty);
7253-
}
7254-
case NEON::BI__builtin_neon_vminv_s16: {
7255-
Int = Intrinsic::aarch64_neon_sminv;
7256-
Ty = Int32Ty;
7257-
VTy = llvm::FixedVectorType::get(Int16Ty, 4);
7258-
llvm::Type *Tys[2] = { Ty, VTy };
7259-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7260-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7261-
return Builder.CreateTrunc(Ops[0], Int16Ty);
7262-
}
7263-
case NEON::BI__builtin_neon_vminvq_s8: {
7264-
Int = Intrinsic::aarch64_neon_sminv;
7265-
Ty = Int32Ty;
7266-
VTy = llvm::FixedVectorType::get(Int8Ty, 16);
7267-
llvm::Type *Tys[2] = { Ty, VTy };
7268-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7269-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7270-
return Builder.CreateTrunc(Ops[0], Int8Ty);
7271-
}
7272-
case NEON::BI__builtin_neon_vminvq_s16: {
7273-
Int = Intrinsic::aarch64_neon_sminv;
7274-
Ty = Int32Ty;
7275-
VTy = llvm::FixedVectorType::get(Int16Ty, 8);
7276-
llvm::Type *Tys[2] = { Ty, VTy };
7277-
Ops.push_back(EmitScalarExpr(E->getArg(0)));
7278-
Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
7279-
return Builder.CreateTrunc(Ops[0], Int16Ty);
7280-
}
72817112
case NEON::BI__builtin_neon_vminv_f16: {
72827113
Int = Intrinsic::aarch64_neon_fminv;
72837114
Ty = HalfTy;

0 commit comments

Comments
 (0)