@@ -819,7 +819,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
819819 {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
820820 .libcallFor ({{s16, s128}, {s32, s128}, {s64, s128}})
821821 .moreElementsToNextPow2 (1 )
822- .lowerIf ([](const LegalityQuery &Q) {
822+ .customIf ([](const LegalityQuery &Q) {
823823 LLT DstTy = Q.Types [0 ];
824824 LLT SrcTy = Q.Types [1 ];
825825 return SrcTy.isFixedVector () && DstTy.isFixedVector () &&
@@ -1474,6 +1474,10 @@ bool AArch64LegalizerInfo::legalizeCustom(
14741474 return legalizeICMP (MI, MRI, MIRBuilder);
14751475 case TargetOpcode::G_BITCAST:
14761476 return legalizeBitcast (MI, Helper);
1477+ case TargetOpcode::G_FPTRUNC:
1478+ // In order to vectorise f64 to f16 properly, we need to use f32 as an
1479+ // intermediary
1480+ return legalizeFptrunc (MI, MIRBuilder, MRI);
14771481 }
14781482
14791483 llvm_unreachable (" expected switch to return" );
@@ -2400,3 +2404,80 @@ bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
24002404 MI.eraseFromParent ();
24012405 return true ;
24022406}
2407+
2408+ bool AArch64LegalizerInfo::legalizeFptrunc (MachineInstr &MI,
2409+ MachineIRBuilder &MIRBuilder,
2410+ MachineRegisterInfo &MRI) const {
2411+ auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs ();
2412+ assert (SrcTy.isFixedVector () && isPowerOf2_32 (SrcTy.getNumElements ()) &&
2413+ " Expected a power of 2 elements" );
2414+
2415+ LLT s16 = LLT::scalar (16 );
2416+ LLT s32 = LLT::scalar (32 );
2417+ LLT s64 = LLT::scalar (64 );
2418+ LLT v2s16 = LLT::fixed_vector (2 , s16);
2419+ LLT v4s16 = LLT::fixed_vector (4 , s16);
2420+ LLT v2s32 = LLT::fixed_vector (2 , s32);
2421+ LLT v4s32 = LLT::fixed_vector (4 , s32);
2422+ LLT v2s64 = LLT::fixed_vector (2 , s64);
2423+
2424+ SmallVector<Register> RegsToUnmergeTo;
2425+ SmallVector<Register> TruncOddDstRegs;
2426+ SmallVector<Register> RegsToMerge;
2427+
2428+ unsigned ElemCount = SrcTy.getNumElements ();
2429+
2430+ // Find the biggest size chunks we can work with
2431+ int StepSize = ElemCount % 4 ? 2 : 4 ;
2432+
2433+ // If we have a power of 2 greater than 2, we need to first unmerge into
2434+ // enough pieces
2435+ if (ElemCount <= 2 )
2436+ RegsToUnmergeTo.push_back (Src);
2437+ else {
2438+ for (unsigned i = 0 ; i < ElemCount / 2 ; ++i) {
2439+ RegsToUnmergeTo.push_back (MRI.createGenericVirtualRegister (v2s64));
2440+ }
2441+
2442+ MIRBuilder.buildUnmerge (RegsToUnmergeTo, Src);
2443+ }
2444+
2445+ // Create all of the round-to-odd instructions and store them
2446+ for (auto SrcReg : RegsToUnmergeTo) {
2447+ Register Mid =
2448+ MIRBuilder.buildInstr (AArch64::G_FPTRUNC_ODD, {v2s32}, {SrcReg})
2449+ .getReg (0 );
2450+ TruncOddDstRegs.push_back (Mid);
2451+ }
2452+
2453+ // Truncate 4s32 to 4s16 if we can to reduce instruction count, otherwise truncate 2s32 to 2s16.
2454+ unsigned Index = 0 ;
2455+ for (unsigned LoopIter = 0 ; LoopIter < ElemCount / StepSize; ++LoopIter) {
2456+ if (StepSize == 4 ) {
2457+ Register ConcatDst =
2458+ MIRBuilder
2459+ .buildMergeLikeInstr (
2460+ {v4s32}, {TruncOddDstRegs[Index++], TruncOddDstRegs[Index++]})
2461+ .getReg (0 );
2462+
2463+ RegsToMerge.push_back (
2464+ MIRBuilder.buildFPTrunc (v4s16, ConcatDst).getReg (0 ));
2465+ } else {
2466+ RegsToMerge.push_back (
2467+ MIRBuilder.buildFPTrunc (v2s16, TruncOddDstRegs[Index++]).getReg (0 ));
2468+ }
2469+ }
2470+
2471+ // If there is only one register, replace the destination
2472+ if (RegsToMerge.size () == 1 ) {
2473+ MRI.replaceRegWith (Dst, RegsToMerge.pop_back_val ());
2474+ MI.eraseFromParent ();
2475+ return true ;
2476+ }
2477+
2478+ // Merge the rest of the instructions & replace the register
2479+ Register Fin = MIRBuilder.buildMergeLikeInstr (DstTy, RegsToMerge).getReg (0 );
2480+ MRI.replaceRegWith (Dst, Fin);
2481+ MI.eraseFromParent ();
2482+ return true ;
2483+ }
0 commit comments