Skip to content
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
737470e
[AArch64][GlobalISel] Improve lowering of vector fp16 fptrunc and fpext
HolyMolyCowMan Oct 13, 2025
ae3ef1e
Address review comments
HolyMolyCowMan Oct 17, 2025
43b1509
Separate FPEXT & FPTRUNC changes
HolyMolyCowMan Oct 20, 2025
437caa3
Add new opcode for rounding to odd
HolyMolyCowMan Oct 21, 2025
6abe127
Disable combiner
HolyMolyCowMan Oct 21, 2025
13fc5dc
Use tablegen for matching
HolyMolyCowMan Oct 21, 2025
0ceacd7
Remove unused code
HolyMolyCowMan Oct 21, 2025
8b85744
Update tests
HolyMolyCowMan Oct 28, 2025
ec102fc
Move from a custom legalize function to lowering
HolyMolyCowMan Oct 29, 2025
411afc0
Linting
HolyMolyCowMan Oct 29, 2025
60b6da7
Update vocab & entities lists
HolyMolyCowMan Oct 29, 2025
76a03d6
Update reference_triplets.txt
HolyMolyCowMan Oct 29, 2025
a5635b7
Add trailing new line to reference_triplets.txt
HolyMolyCowMan Oct 29, 2025
5f97537
Add G_FPTRUNC_ODD to generic opcodes & check legality before emitting…
HolyMolyCowMan Oct 30, 2025
3671057
Undo removal of newline
HolyMolyCowMan Nov 3, 2025
39c3e04
Remove shared opcode & subsequent changes
HolyMolyCowMan Nov 26, 2025
0604176
Rework to make AArch64 specific & handle splitting into legal chunks
HolyMolyCowMan Nov 27, 2025
3513809
Merge remote-tracking branch 'origin/main' into fp16-fptrunc-fpext-lo…
HolyMolyCowMan Nov 27, 2025
74aa139
Linting
HolyMolyCowMan Nov 27, 2025
a1bf07a
Update tests
HolyMolyCowMan Nov 27, 2025
60cbbc7
Remove unnecessary brackets, revert a change & modify comment
HolyMolyCowMan Nov 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8510,7 +8510,7 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
MRI.getType(Src).getScalarType() == LLT::scalar(64));

if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
if (MRI.getType(Src).isVector())
return UnableToLegalize;

if (MI.getFlag(MachineInstr::FmAfn)) {
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrGISel.td
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,13 @@ def G_VLSHR : AArch64GenericInstruction {
let hasSideEffects = 0;
}

// Float truncation using round to odd
// AArch64-specific generic instruction with a single result ($dst, type0) and
// a single source ($src, type1); it is declared as having no side effects.
// A GINodeEquiv later in this file ties it to the AArch64fcvtxn_n node so the
// imported selection patterns for that node can match it.
def G_FPTRUNC_ODD : AArch64GenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type1:$src);
let hasSideEffects = false;
}

// Represents an integer to FP conversion on the FPR bank.
def G_SITOF : AArch64GenericInstruction {
let OutOperandList = (outs type0:$dst);
Expand Down Expand Up @@ -297,6 +304,8 @@ def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;

def : GINodeEquiv<G_AARCH64_PREFETCH, AArch64Prefetch>;

def : GINodeEquiv<G_FPTRUNC_ODD, AArch64fcvtxn_n>;

// These are patterns that we only use for GlobalISel via the importer.
def : Pat<(f32 (fadd (vector_extract (v2f32 FPR64:$Rn), (i64 0)),
(vector_extract (v2f32 FPR64:$Rn), (i64 1)))),
Expand Down
96 changes: 94 additions & 2 deletions llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DerivedTypes.h"
Expand Down Expand Up @@ -820,8 +821,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.legalFor(
{{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
.libcallFor({{s16, s128}, {s32, s128}, {s64, s128}})
.clampNumElements(0, v4s16, v4s16)
.clampNumElements(0, v2s32, v2s32)
.moreElementsToNextPow2(1)
.customIf([](const LegalityQuery &Q) {
LLT DstTy = Q.Types[0];
LLT SrcTy = Q.Types[1];
return SrcTy.isFixedVector() && DstTy.isFixedVector() &&
SrcTy.getScalarSizeInBits() == 64 &&
DstTy.getScalarSizeInBits() == 16;
})
// Clamp based on input
.clampNumElements(1, v4s32, v4s32)
.clampNumElements(1, v2s64, v2s64)
.scalarize(0);

getActionDefinitionsBuilder(G_FPEXT)
Expand Down Expand Up @@ -1479,6 +1489,10 @@ bool AArch64LegalizerInfo::legalizeCustom(
return legalizeICMP(MI, MRI, MIRBuilder);
case TargetOpcode::G_BITCAST:
return legalizeBitcast(MI, Helper);
case TargetOpcode::G_FPTRUNC:
// In order to vectorise the f64 to f16 truncation properly, we need to use
// f32 as an intermediary
return legalizeFptrunc(MI, MIRBuilder, MRI);
}

llvm_unreachable("expected switch to return");
Expand Down Expand Up @@ -2416,3 +2430,81 @@ bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
MI.eraseFromParent();
return true;
}

// Custom legalization of a vector G_FPTRUNC in two narrowing steps.
//
// The source vector is split into v2s64 pieces, each piece is converted to
// v2s32 with G_FPTRUNC_ODD, and the v2s32 values are then narrowed again with
// an ordinary G_FPTRUNC (to v4s16 after concatenating pairs, or to v2s16) and
// finally merged into the destination register.
//
// The G_FPTRUNC rules in this file route here only for fixed vectors with
// 64-bit source elements and 16-bit destination elements; this function does
// not re-check the element sizes itself.
// NOTE(review): the round-to-odd first step is presumably there to avoid
// double-rounding error across the two conversions - confirm.
//
// MI is the G_FPTRUNC being replaced; it is erased on every path.
// MIRBuilder is used to emit the replacement instructions.
// MRI is used to create the intermediate virtual registers and to rewrite
// uses of the original destination.
// Always returns true.
bool AArch64LegalizerInfo::legalizeFptrunc(MachineInstr &MI,
MachineIRBuilder &MIRBuilder,
MachineRegisterInfo &MRI) const {
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
// Only fixed vectors with a power-of-two element count are handled; the
// splitting below into halves/quarters relies on that.
assert(SrcTy.isFixedVector() && isPowerOf2_32(SrcTy.getNumElements()) &&
"Expected a power of 2 elements");
Comment on lines +2438 to +2439
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we make this work with a "multiple of 2", not a "power of 2"?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can do but currently the legalizer widens the fptrunc src to the next power of 2, meaning we can keep this simple if we only expect powers of 2. Otherwise, we might have to pad vectors so we can later concat them.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh - that fine for now then. We should go through at some point and check non-power2 vector types.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should I add a todo comment?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No thats fine, we need to go through all of them I think.


// Local type shorthands used for the intermediate values.
LLT s16 = LLT::scalar(16);
LLT s32 = LLT::scalar(32);
LLT s64 = LLT::scalar(64);
LLT v2s16 = LLT::fixed_vector(2, s16);
LLT v4s16 = LLT::fixed_vector(4, s16);
LLT v2s32 = LLT::fixed_vector(2, s32);
LLT v4s32 = LLT::fixed_vector(4, s32);
LLT v2s64 = LLT::fixed_vector(2, s64);

// RegsToUnmergeTo: the v2s64 pieces of the source.
// TruncOddDstRegs: one v2s32 G_FPTRUNC_ODD result per piece, in order.
// RegsToMerge: the s16-element results that are merged into Dst.
SmallVector<Register> RegsToUnmergeTo;
SmallVector<Register> TruncOddDstRegs;
SmallVector<Register> RegsToMerge;

unsigned ElemCount = SrcTy.getNumElements();

// Find the biggest size chunks we can work with
// (4 whenever ElemCount is a multiple of 4, otherwise 2).
int StepSize = ElemCount % 4 ? 2 : 4;

// If we have a power of 2 greater than 2, we need to first unmerge into
// enough pieces
// Otherwise the source register itself is used as the single piece.
if (ElemCount <= 2)
RegsToUnmergeTo.push_back(Src);
else {
for (unsigned i = 0; i < ElemCount / 2; ++i) {
RegsToUnmergeTo.push_back(MRI.createGenericVirtualRegister(v2s64));
}

MIRBuilder.buildUnmerge(RegsToUnmergeTo, Src);
}

// Create all of the round-to-odd instructions and store them
for (auto SrcReg : RegsToUnmergeTo) {
Register Mid =
MIRBuilder.buildInstr(AArch64::G_FPTRUNC_ODD, {v2s32}, {SrcReg})
.getReg(0);
TruncOddDstRegs.push_back(Mid);
}

// Truncate 4s32 to 4s16 if we can to reduce instruction count, otherwise
// truncate 2s32 to 2s16.
// Index walks TruncOddDstRegs: two entries are consumed per iteration when
// StepSize is 4, one entry when it is 2.
unsigned Index = 0;
for (unsigned LoopIter = 0; LoopIter < ElemCount / StepSize; ++LoopIter) {
if (StepSize == 4) {
// Concatenate two neighbouring v2s32 values into one v4s32.
// NOTE(review): both Index++ sit in the same braced list, so the pairing
// depends on the elements being evaluated left to right - confirm this
// holds for the overload selected here.
Register ConcatDst =
MIRBuilder
.buildMergeLikeInstr(
{v4s32}, {TruncOddDstRegs[Index++], TruncOddDstRegs[Index++]})
.getReg(0);

RegsToMerge.push_back(
MIRBuilder.buildFPTrunc(v4s16, ConcatDst).getReg(0));
} else {
RegsToMerge.push_back(
MIRBuilder.buildFPTrunc(v2s16, TruncOddDstRegs[Index++]).getReg(0));
}
}

// If there is only one register, replace the destination
// (no merge instruction is needed; uses of Dst are redirected to it).
if (RegsToMerge.size() == 1) {
MRI.replaceRegWith(Dst, RegsToMerge.pop_back_val());
MI.eraseFromParent();
return true;
}

// Merge the rest of the instructions & replace the register
Register Fin = MIRBuilder.buildMergeLikeInstr(DstTy, RegsToMerge).getReg(0);
MRI.replaceRegWith(Dst, Fin);
MI.eraseFromParent();
return true;
}
2 changes: 2 additions & 0 deletions llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ class AArch64LegalizerInfo : public LegalizerInfo {
bool legalizeDynStackAlloc(MachineInstr &MI, LegalizerHelper &Helper) const;
bool legalizePrefetch(MachineInstr &MI, LegalizerHelper &Helper) const;
bool legalizeBitcast(MachineInstr &MI, LegalizerHelper &Helper) const;
bool legalizeFptrunc(MachineInstr &MI, MachineIRBuilder &MIRBuilder,
MachineRegisterInfo &MRI) const;
const AArch64Subtarget *ST;
};
} // End llvm namespace.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -578,8 +578,8 @@
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: G_FPTRUNC (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
# DEBUG-NEXT: .. the first uncovered type index: 2, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: G_FPTOSI (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
Expand Down
24 changes: 8 additions & 16 deletions llvm/test/CodeGen/AArch64/arm64-fp128.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1197,30 +1197,22 @@ define <2 x half> @vec_round_f16(<2 x fp128> %val) {
;
; CHECK-GI-LABEL: vec_round_f16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: sub sp, sp, #64
; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Spill
; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
; CHECK-GI-NEXT: sub sp, sp, #48
; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Spill
; CHECK-GI-NEXT: .cfi_def_cfa_offset 48
; CHECK-GI-NEXT: .cfi_offset w30, -16
; CHECK-GI-NEXT: mov v2.d[0], x8
; CHECK-GI-NEXT: str q1, [sp] // 16-byte Spill
; CHECK-GI-NEXT: mov v2.d[1], x8
; CHECK-GI-NEXT: str q2, [sp, #32] // 16-byte Spill
; CHECK-GI-NEXT: bl __trunctfhf2
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $q0
; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Spill
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Reload
; CHECK-GI-NEXT: bl __trunctfhf2
; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Reload
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $q0
; CHECK-GI-NEXT: str q0, [sp] // 16-byte Spill
; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Reload
; CHECK-GI-NEXT: bl __trunctfhf2
; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Reload
; CHECK-GI-NEXT: bl __trunctfhf2
; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Reload
; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: add sp, sp, #64
; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Reload
; CHECK-GI-NEXT: mov v1.h[1], v0.h[0]
; CHECK-GI-NEXT: fmov d0, d1
; CHECK-GI-NEXT: add sp, sp, #48
; CHECK-GI-NEXT: ret
%dst = fptrunc <2 x fp128> %val to <2 x half>
ret <2 x half> %dst
Expand Down
47 changes: 6 additions & 41 deletions llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -170,47 +170,12 @@ define <4 x half> @s_to_h(<4 x float> %a) {
}

define <4 x half> @d_to_h(<4 x double> %a) {
; CHECK-CVT-SD-LABEL: d_to_h:
; CHECK-CVT-SD: // %bb.0:
; CHECK-CVT-SD-NEXT: fcvtxn v0.2s, v0.2d
; CHECK-CVT-SD-NEXT: fcvtxn2 v0.4s, v1.2d
; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-SD-LABEL: d_to_h:
; CHECK-FP16-SD: // %bb.0:
; CHECK-FP16-SD-NEXT: fcvtxn v0.2s, v0.2d
; CHECK-FP16-SD-NEXT: fcvtxn2 v0.4s, v1.2d
; CHECK-FP16-SD-NEXT: fcvtn v0.4h, v0.4s
; CHECK-FP16-SD-NEXT: ret
;
; CHECK-CVT-GI-LABEL: d_to_h:
; CHECK-CVT-GI: // %bb.0:
; CHECK-CVT-GI-NEXT: mov d2, v0.d[1]
; CHECK-CVT-GI-NEXT: fcvt h0, d0
; CHECK-CVT-GI-NEXT: mov d3, v1.d[1]
; CHECK-CVT-GI-NEXT: fcvt h1, d1
; CHECK-CVT-GI-NEXT: fcvt h2, d2
; CHECK-CVT-GI-NEXT: mov v0.h[1], v2.h[0]
; CHECK-CVT-GI-NEXT: fcvt h2, d3
; CHECK-CVT-GI-NEXT: mov v0.h[2], v1.h[0]
; CHECK-CVT-GI-NEXT: mov v0.h[3], v2.h[0]
; CHECK-CVT-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-CVT-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: d_to_h:
; CHECK-FP16-GI: // %bb.0:
; CHECK-FP16-GI-NEXT: mov d2, v0.d[1]
; CHECK-FP16-GI-NEXT: fcvt h0, d0
; CHECK-FP16-GI-NEXT: mov d3, v1.d[1]
; CHECK-FP16-GI-NEXT: fcvt h1, d1
; CHECK-FP16-GI-NEXT: fcvt h2, d2
; CHECK-FP16-GI-NEXT: mov v0.h[1], v2.h[0]
; CHECK-FP16-GI-NEXT: fcvt h2, d3
; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[0]
; CHECK-FP16-GI-NEXT: mov v0.h[3], v2.h[0]
; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-FP16-GI-NEXT: ret
; CHECK-LABEL: d_to_h:
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtxn v0.2s, v0.2d
; CHECK-NEXT: fcvtxn2 v0.4s, v1.2d
; CHECK-NEXT: fcvtn v0.4h, v0.4s
; CHECK-NEXT: ret
%1 = fptrunc <4 x double> %a to <4 x half>
ret <4 x half> %1
}
Expand Down
74 changes: 9 additions & 65 deletions llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -176,71 +176,15 @@ define <8 x half> @s_to_h(<8 x float> %a) {
}

define <8 x half> @d_to_h(<8 x double> %a) {
; CHECK-CVT-SD-LABEL: d_to_h:
; CHECK-CVT-SD: // %bb.0:
; CHECK-CVT-SD-NEXT: fcvtxn v0.2s, v0.2d
; CHECK-CVT-SD-NEXT: fcvtxn v2.2s, v2.2d
; CHECK-CVT-SD-NEXT: fcvtxn2 v0.4s, v1.2d
; CHECK-CVT-SD-NEXT: fcvtxn2 v2.4s, v3.2d
; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v2.4s
; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-SD-LABEL: d_to_h:
; CHECK-FP16-SD: // %bb.0:
; CHECK-FP16-SD-NEXT: fcvtxn v0.2s, v0.2d
; CHECK-FP16-SD-NEXT: fcvtxn v2.2s, v2.2d
; CHECK-FP16-SD-NEXT: fcvtxn2 v0.4s, v1.2d
; CHECK-FP16-SD-NEXT: fcvtxn2 v2.4s, v3.2d
; CHECK-FP16-SD-NEXT: fcvtn v0.4h, v0.4s
; CHECK-FP16-SD-NEXT: fcvtn2 v0.8h, v2.4s
; CHECK-FP16-SD-NEXT: ret
;
; CHECK-CVT-GI-LABEL: d_to_h:
; CHECK-CVT-GI: // %bb.0:
; CHECK-CVT-GI-NEXT: mov d4, v0.d[1]
; CHECK-CVT-GI-NEXT: fcvt h0, d0
; CHECK-CVT-GI-NEXT: mov d5, v1.d[1]
; CHECK-CVT-GI-NEXT: fcvt h1, d1
; CHECK-CVT-GI-NEXT: fcvt h4, d4
; CHECK-CVT-GI-NEXT: mov v0.h[1], v4.h[0]
; CHECK-CVT-GI-NEXT: fcvt h4, d5
; CHECK-CVT-GI-NEXT: mov v0.h[2], v1.h[0]
; CHECK-CVT-GI-NEXT: mov d1, v2.d[1]
; CHECK-CVT-GI-NEXT: fcvt h2, d2
; CHECK-CVT-GI-NEXT: mov v0.h[3], v4.h[0]
; CHECK-CVT-GI-NEXT: fcvt h1, d1
; CHECK-CVT-GI-NEXT: mov v0.h[4], v2.h[0]
; CHECK-CVT-GI-NEXT: mov d2, v3.d[1]
; CHECK-CVT-GI-NEXT: fcvt h3, d3
; CHECK-CVT-GI-NEXT: mov v0.h[5], v1.h[0]
; CHECK-CVT-GI-NEXT: fcvt h1, d2
; CHECK-CVT-GI-NEXT: mov v0.h[6], v3.h[0]
; CHECK-CVT-GI-NEXT: mov v0.h[7], v1.h[0]
; CHECK-CVT-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: d_to_h:
; CHECK-FP16-GI: // %bb.0:
; CHECK-FP16-GI-NEXT: mov d4, v0.d[1]
; CHECK-FP16-GI-NEXT: fcvt h0, d0
; CHECK-FP16-GI-NEXT: mov d5, v1.d[1]
; CHECK-FP16-GI-NEXT: fcvt h1, d1
; CHECK-FP16-GI-NEXT: fcvt h4, d4
; CHECK-FP16-GI-NEXT: mov v0.h[1], v4.h[0]
; CHECK-FP16-GI-NEXT: fcvt h4, d5
; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[0]
; CHECK-FP16-GI-NEXT: mov d1, v2.d[1]
; CHECK-FP16-GI-NEXT: fcvt h2, d2
; CHECK-FP16-GI-NEXT: mov v0.h[3], v4.h[0]
; CHECK-FP16-GI-NEXT: fcvt h1, d1
; CHECK-FP16-GI-NEXT: mov v0.h[4], v2.h[0]
; CHECK-FP16-GI-NEXT: mov d2, v3.d[1]
; CHECK-FP16-GI-NEXT: fcvt h3, d3
; CHECK-FP16-GI-NEXT: mov v0.h[5], v1.h[0]
; CHECK-FP16-GI-NEXT: fcvt h1, d2
; CHECK-FP16-GI-NEXT: mov v0.h[6], v3.h[0]
; CHECK-FP16-GI-NEXT: mov v0.h[7], v1.h[0]
; CHECK-FP16-GI-NEXT: ret
; CHECK-LABEL: d_to_h:
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtxn v0.2s, v0.2d
; CHECK-NEXT: fcvtxn v2.2s, v2.2d
; CHECK-NEXT: fcvtxn2 v0.4s, v1.2d
; CHECK-NEXT: fcvtxn2 v2.4s, v3.2d
; CHECK-NEXT: fcvtn v0.4h, v0.4s
; CHECK-NEXT: fcvtn2 v0.8h, v2.4s
; CHECK-NEXT: ret
%1 = fptrunc <8 x double> %a to <8 x half>
ret <8 x half> %1
}
Expand Down
Loading