@@ -6759,29 +6759,45 @@ Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
   return Builder.CreateCall(F, Ops, name);
 }
 
-Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
+Value *CodeGenFunction::EmitFP8NeonCall(unsigned IID,
+                                        ArrayRef<llvm::Type *> Tys,
                                         SmallVectorImpl<Value *> &Ops,
-                                        Value *FPM, const char *name) {
+                                        const CallExpr *E, const char *name) {
+  llvm::Value *FPM =
+      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
   Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), FPM);
-  return EmitNeonCall(F, Ops, name);
+  return EmitNeonCall(CGM.getIntrinsic(IID, Tys), Ops, name);
 }
 
 llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
-    unsigned IID, bool ExtendLane, llvm::Type *RetTy,
+    unsigned IID, bool ExtendLaneArg, llvm::Type *RetTy,
     SmallVectorImpl<llvm::Value *> &Ops, const CallExpr *E, const char *name) {
 
   const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
                              RetTy->getPrimitiveSizeInBits();
   llvm::Type *Tys[] = {llvm::FixedVectorType::get(RetTy, ElemCount),
                        Ops[1]->getType()};
-  if (ExtendLane) {
+  if (ExtendLaneArg) {
     auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
     Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
                                         Builder.getInt64(0));
   }
-  llvm::Value *FPM =
-      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
-  return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+  return EmitFP8NeonCall(IID, Tys, Ops, E, name);
+}
+
+llvm::Value *CodeGenFunction::EmitFP8NeonFMLACall(
+    unsigned IID, bool ExtendLaneArg, llvm::Type *RetTy,
+    SmallVectorImpl<llvm::Value *> &Ops, const CallExpr *E, const char *name) {
+
+  if (ExtendLaneArg) {
+    auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
+    Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
+                                        Builder.getInt64(0));
+  }
+  const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
+                             RetTy->getPrimitiveSizeInBits();
+  return EmitFP8NeonCall(IID, {llvm::FixedVectorType::get(RetTy, ElemCount)},
+                         Ops, E, name);
 }
 
 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
@@ -6802,9 +6818,7 @@ Value *CodeGenFunction::EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
     Tys[1] = llvm::FixedVectorType::get(Int8Ty, 8);
     Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
   }
-  llvm::Value *FPM =
-      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
-  return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+  return EmitFP8NeonCall(IID, Tys, Ops, E, name);
 }
 
 // Right-shift a vector by a constant.
@@ -12779,7 +12793,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
 
   unsigned Int;
   bool ExtractLow = false;
-  bool ExtendLane = false;
+  bool ExtendLaneArg = false;
   switch (BuiltinID) {
   default: return nullptr;
   case NEON::BI__builtin_neon_vbsl_v:
@@ -14054,24 +14068,85 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
                                Ops, E, "fdot2");
   case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm:
-    ExtendLane = true;
+    ExtendLaneArg = true;
     LLVM_FALLTHROUGH;
   case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm:
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2_lane,
-                               ExtendLane, HalfTy, Ops, E, "fdot2_lane");
+                               ExtendLaneArg, HalfTy, Ops, E, "fdot2_lane");
   case NEON::BI__builtin_neon_vdot_f32_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_f32_mf8_fpm:
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4, false,
                                FloatTy, Ops, E, "fdot4");
   case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm:
-    ExtendLane = true;
+    ExtendLaneArg = true;
     LLVM_FALLTHROUGH;
   case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm:
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
-                               ExtendLane, FloatTy, Ops, E, "fdot4_lane");
+                               ExtendLaneArg, FloatTy, Ops, E, "fdot4_lane");
+
+  case NEON::BI__builtin_neon_vmlalbq_f16_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalb,
+                           {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
+                           "vmlal");
+  case NEON::BI__builtin_neon_vmlaltq_f16_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalt,
+                           {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
+                           "vmlal");
+  case NEON::BI__builtin_neon_vmlallbbq_f32_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlallbb,
+                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+                           "vmlall");
+  case NEON::BI__builtin_neon_vmlallbtq_f32_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlallbt,
+                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+                           "vmlall");
+  case NEON::BI__builtin_neon_vmlalltbq_f32_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalltb,
+                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+                           "vmlall");
+  case NEON::BI__builtin_neon_vmlallttq_f32_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalltt,
+                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+                           "vmlall");
+  case NEON::BI__builtin_neon_vmlalbq_lane_f16_mf8_fpm:
+    ExtendLaneArg = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vmlalbq_laneq_f16_mf8_fpm:
+    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalb_lane,
+                               ExtendLaneArg, HalfTy, Ops, E, "vmlal_lane");
+  case NEON::BI__builtin_neon_vmlaltq_lane_f16_mf8_fpm:
+    ExtendLaneArg = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vmlaltq_laneq_f16_mf8_fpm:
+    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalt_lane,
+                               ExtendLaneArg, HalfTy, Ops, E, "vmlal_lane");
+  case NEON::BI__builtin_neon_vmlallbbq_lane_f32_mf8_fpm:
+    ExtendLaneArg = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vmlallbbq_laneq_f32_mf8_fpm:
+    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlallbb_lane,
+                               ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
+  case NEON::BI__builtin_neon_vmlallbtq_lane_f32_mf8_fpm:
+    ExtendLaneArg = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vmlallbtq_laneq_f32_mf8_fpm:
+    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlallbt_lane,
+                               ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
+  case NEON::BI__builtin_neon_vmlalltbq_lane_f32_mf8_fpm:
+    ExtendLaneArg = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vmlalltbq_laneq_f32_mf8_fpm:
+    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalltb_lane,
+                               ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
+  case NEON::BI__builtin_neon_vmlallttq_lane_f32_mf8_fpm:
+    ExtendLaneArg = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vmlallttq_laneq_f32_mf8_fpm:
+    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalltt_lane,
+                               ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
   case NEON::BI__builtin_neon_vamin_f16:
   case NEON::BI__builtin_neon_vaminq_f16:
   case NEON::BI__builtin_neon_vamin_f32:
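Usage illustration (not part of the commit): the new builtin cases back ACLE-style FP8 NEON intrinsics whose trailing argument carries the FPMR value. Below is a minimal C sketch; the intrinsic name is inferred from the builtin name, and the signature float16x8_t vmlalbq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, fpm_t), the availability in <arm_neon.h>, and the helper name widen_mla_b are all assumptions, not taken from this change.

#include <arm_neon.h>

// Hypothetical wrapper around the assumed FP8 widening multiply-accumulate
// intrinsic. Per the lowering above, the trailing fpm argument is written to
// FPMR via llvm.aarch64.set.fpmr before the fp8 fmlalb intrinsic is emitted.
float16x8_t widen_mla_b(float16x8_t acc, mfloat8x16_t vn, mfloat8x16_t vm,
                        fpm_t fpm) {
  return vmlalbq_f16_mf8_fpm(acc, vn, vm, fpm);
}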