2121#include " llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
2222#include " llvm/CodeGen/GlobalISel/Utils.h"
2323#include " llvm/CodeGen/MachineInstr.h"
24+ #include " llvm/CodeGen/MachineInstrBuilder.h"
2425#include " llvm/CodeGen/MachineRegisterInfo.h"
2526#include " llvm/CodeGen/TargetOpcodes.h"
2627#include " llvm/IR/DerivedTypes.h"
@@ -820,8 +821,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
820821 .legalFor (
821822 {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
822823 .libcallFor ({{s16, s128}, {s32, s128}, {s64, s128}})
823- .clampNumElements (0 , v4s16, v4s16)
824- .clampNumElements (0 , v2s32, v2s32)
824+ .moreElementsToNextPow2 (1 )
825+ .customIf ([](const LegalityQuery &Q) {
826+ LLT DstTy = Q.Types [0 ];
827+ LLT SrcTy = Q.Types [1 ];
828+ return SrcTy.isFixedVector () && DstTy.isFixedVector () &&
829+ SrcTy.getScalarSizeInBits () == 64 &&
830+ DstTy.getScalarSizeInBits () == 16 ;
831+ })
832+ // Clamp based on input
833+ .clampNumElements (1 , v4s32, v4s32)
834+ .clampNumElements (1 , v2s64, v2s64)
825835 .scalarize (0 );
826836
827837 getActionDefinitionsBuilder (G_FPEXT)
@@ -1479,6 +1489,10 @@ bool AArch64LegalizerInfo::legalizeCustom(
14791489 return legalizeICMP (MI, MRI, MIRBuilder);
14801490 case TargetOpcode::G_BITCAST:
14811491 return legalizeBitcast (MI, Helper);
1492+ case TargetOpcode::G_FPTRUNC:
1493+ // In order to lower f64 to f16 properly, we need to use f32 as an
1494+ // intermediary
1495+ return legalizeFptrunc (MI, MIRBuilder, MRI);
14821496 }
14831497
14841498 llvm_unreachable (" expected switch to return" );
@@ -2416,3 +2430,80 @@ bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
24162430 MI.eraseFromParent ();
24172431 return true ;
24182432}
2433+
2434+ bool AArch64LegalizerInfo::legalizeFptrunc (MachineInstr &MI,
2435+ MachineIRBuilder &MIRBuilder,
2436+ MachineRegisterInfo &MRI) const {
2437+ auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs ();
2438+ assert (SrcTy.isFixedVector () && isPowerOf2_32 (SrcTy.getNumElements ()) &&
2439+ " Expected a power of 2 elements" );
2440+
2441+ LLT s16 = LLT::scalar (16 );
2442+ LLT s32 = LLT::scalar (32 );
2443+ LLT s64 = LLT::scalar (64 );
2444+ LLT v2s16 = LLT::fixed_vector (2 , s16);
2445+ LLT v4s16 = LLT::fixed_vector (4 , s16);
2446+ LLT v2s32 = LLT::fixed_vector (2 , s32);
2447+ LLT v4s32 = LLT::fixed_vector (4 , s32);
2448+ LLT v2s64 = LLT::fixed_vector (2 , s64);
2449+
2450+ SmallVector<Register> RegsToUnmergeTo;
2451+ SmallVector<Register> TruncOddDstRegs;
2452+ SmallVector<Register> RegsToMerge;
2453+
2454+ unsigned ElemCount = SrcTy.getNumElements ();
2455+
2456+ // Find the biggest size chunks we can work with
2457+ int StepSize = ElemCount % 4 ? 2 : 4 ;
2458+
2459+ // If we have a power of 2 greater than 2, we need to first unmerge into
2460+ // enough pieces
2461+ if (ElemCount <= 2 )
2462+ RegsToUnmergeTo.push_back (Src);
2463+ else {
2464+ for (unsigned i = 0 ; i < ElemCount / 2 ; ++i)
2465+ RegsToUnmergeTo.push_back (MRI.createGenericVirtualRegister (v2s64));
2466+
2467+ MIRBuilder.buildUnmerge (RegsToUnmergeTo, Src);
2468+ }
2469+
2470+ // Create all of the round-to-odd instructions and store them
2471+ for (auto SrcReg : RegsToUnmergeTo) {
2472+ Register Mid =
2473+ MIRBuilder.buildInstr (AArch64::G_FPTRUNC_ODD, {v2s32}, {SrcReg})
2474+ .getReg (0 );
2475+ TruncOddDstRegs.push_back (Mid);
2476+ }
2477+
2478+ // Truncate 4s32 to 4s16 if we can to reduce instruction count, otherwise
2479+ // truncate 2s32 to 2s16.
2480+ unsigned Index = 0 ;
2481+ for (unsigned LoopIter = 0 ; LoopIter < ElemCount / StepSize; ++LoopIter) {
2482+ if (StepSize == 4 ) {
2483+ Register ConcatDst =
2484+ MIRBuilder
2485+ .buildMergeLikeInstr (
2486+ {v4s32}, {TruncOddDstRegs[Index++], TruncOddDstRegs[Index++]})
2487+ .getReg (0 );
2488+
2489+ RegsToMerge.push_back (
2490+ MIRBuilder.buildFPTrunc (v4s16, ConcatDst).getReg (0 ));
2491+ } else {
2492+ RegsToMerge.push_back (
2493+ MIRBuilder.buildFPTrunc (v2s16, TruncOddDstRegs[Index++]).getReg (0 ));
2494+ }
2495+ }
2496+
2497+ // If there is only one register, replace the destination
2498+ if (RegsToMerge.size () == 1 ) {
2499+ MRI.replaceRegWith (Dst, RegsToMerge.pop_back_val ());
2500+ MI.eraseFromParent ();
2501+ return true ;
2502+ }
2503+
2504+ // Merge the rest of the instructions & replace the register
2505+ Register Fin = MIRBuilder.buildMergeLikeInstr (DstTy, RegsToMerge).getReg (0 );
2506+ MRI.replaceRegWith (Dst, Fin);
2507+ MI.eraseFromParent ();
2508+ return true ;
2509+ }
0 commit comments