Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -3464,6 +3464,33 @@ class TargetLoweringBase {
return false;
}

// Get the preferred opcode for FP_TO_XINT nodes.
// By default, this checks if the provded operation is an illegal FP_TO_UINT
// and if so, checks if FP_TO_SINT is legal or custom for use as a
// replacement. If both UINT and SINT conversions are Custom, we choose SINT
// by default because that's the right thing on PPC.
virtual unsigned getFPToXIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Name could use work. getPreferredFPToIntOpcode?

if (isOperationLegal(Op, ToVT))
return Op;
switch (Op) {
case ISD::FP_TO_UINT:
if (isOperationLegalOrCustom(ISD::FP_TO_SINT, ToVT))
return ISD::FP_TO_SINT;
break;
case ISD::STRICT_FP_TO_UINT:
if (isOperationLegalOrCustom(ISD::STRICT_FP_TO_SINT, ToVT))
return ISD::STRICT_FP_TO_SINT;
break;
case ISD::VP_FP_TO_UINT:
if (isOperationLegalOrCustom(ISD::VP_FP_TO_SINT, ToVT))
return ISD::VP_FP_TO_SINT;
break;
default:
break;
}
return Op;
}

/// Create the IR node for the given complex deinterleaving operation.
/// If one cannot be created using all the given inputs, nullptr should be
/// returned.
Expand Down
22 changes: 2 additions & 20 deletions llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -848,28 +848,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N) {

SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned NewOpc = N->getOpcode();
unsigned NewOpc =
TLI.getFPToXIntOpcode(N->getOpcode(), N->getValueType(0), NVT);
SDLoc dl(N);

// If we're promoting a UINT to a larger size and the larger FP_TO_UINT is
// not Legal, check to see if we can use FP_TO_SINT instead. (If both UINT
// and SINT conversions are Custom, there is no way to tell which is
// preferable. We choose SINT because that's the right thing on PPC.)
if (N->getOpcode() == ISD::FP_TO_UINT &&
!TLI.isOperationLegal(ISD::FP_TO_UINT, NVT) &&
TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT))
NewOpc = ISD::FP_TO_SINT;

if (N->getOpcode() == ISD::STRICT_FP_TO_UINT &&
!TLI.isOperationLegal(ISD::STRICT_FP_TO_UINT, NVT) &&
TLI.isOperationLegalOrCustom(ISD::STRICT_FP_TO_SINT, NVT))
NewOpc = ISD::STRICT_FP_TO_SINT;

if (N->getOpcode() == ISD::VP_FP_TO_UINT &&
!TLI.isOperationLegal(ISD::VP_FP_TO_UINT, NVT) &&
TLI.isOperationLegalOrCustom(ISD::VP_FP_TO_SINT, NVT))
NewOpc = ISD::VP_FP_TO_SINT;

SDValue Res;
if (N->isStrictFPOpcode()) {
Res = DAG.getNode(NewOpc, dl, {NVT, MVT::Other},
Expand Down
27 changes: 27 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6179,6 +6179,33 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
return nullptr;
}

// Rather than default to SINT when both UINT and SINT are custom, we only
// change the opcode when UINT is not legal and SINT is. UINT is preferred when
// both are custom since unsigned CVT instructions can lead to slightly better
// SASS code with fewer instructions.
unsigned NVPTXTargetLowering::getFPToXIntOpcode(unsigned Op, EVT FromVT,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note - This seems like it will always return Op since I'm unaware of cases in NVPTX where a UINT conversion would be illegal and a SINT conversion would be legal, but wasn't quite sure.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to override it for NVPTX then? It sounds like the default implementation would do the job. Reading the original comment in PromoteIntRes_FP_TO_XINT it may be PPC that would need the override.

Given that we did end up with the signed conversion, I suspect NVPTX's problem is not whether we want to pick signed or unsigned conversion, but that we need to extend the set of supported conversions to cover "{f16,f32,f64} <-> u8", possibly to other integer types, too. Then we'd always end up with signedness-matching conversion instruction. Right now we only define them for i1, i16, i32 and i64, but not i8.

for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
setOperationAction(
{ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
VT, Custom);
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this current iteration we would still need it as the default hook is just implementing the hook-less behavior that was in PromoteIntRes_FP_TO_XINT. I feel like the default hook should be what NVPTX has (keep opcode when both are custom) and targets like PPC should override it, but I don't know what other targets require so my opinion isn't strong.

Your point about supported conversions seems good though, so maybe we don't need any kind of hook change for the NVPTX case. I'll look into it more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that because the i8 needs to be promoted, using custom requires that we support these operations in ReplaceNodeResults. Doing that seems like re-implementing PromoteIntRes_FP_TO_XINT, so I'm not sure it's worth it if we can use the hook instead. Thoughts?

Otherwise, the flow looks like:
PromoteIntegerResult->CustomLowerNode->ReplaceNodeResults->"LLVM ERROR: Unhandled custom legalization"

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have a strong preference either way. The patch does improve things for NVPTX, however marginally.

EVT ToVT) const {
if (isOperationLegal(Op, ToVT))
return Op;
switch (Op) {
case ISD::FP_TO_UINT:
if (isOperationLegal(ISD::FP_TO_SINT, ToVT))
return ISD::FP_TO_SINT;
break;
case ISD::STRICT_FP_TO_UINT:
if (isOperationLegal(ISD::STRICT_FP_TO_SINT, ToVT))
return ISD::STRICT_FP_TO_SINT;
break;
case ISD::VP_FP_TO_UINT:
if (isOperationLegal(ISD::VP_FP_TO_SINT, ToVT))
return ISD::VP_FP_TO_SINT;
break;
default:
break;
}
return Op;
}

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;

Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,8 @@ class NVPTXTargetLowering : public TargetLowering {
Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;

unsigned getFPToXIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override;

private:
const NVPTXSubtarget &STI; // cache the subtarget here
mutable unsigned GlobalUniqueCallSite;
Expand Down
134 changes: 134 additions & 0 deletions llvm/test/CodeGen/NVPTX/convert-fp-i8.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does this change buy us?
It appears that on this test LLVM currently generates the same output (AFAICT): https://godbolt.org/z/svM1Kns79

It's possible that I've missed something. Can you point me at the suboptimal code LLVM generates now?
Perhaps we just need a few more test cases to cover the changes introduced by the patch..

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The first two test cases should have different PTX with the change.
If we look at line 21 in the PTX of that godbolt link:
cvt.rzi.s16.f32 %rs1, %f1;
Compare that to the expected output in the test, line 10:
; CHECK-NEXT: cvt.rzi.u16.f32 %rs1, %f1;
The difference is the s16 vs u16. Current LLVM is generating a signed cvt instruction for what should be an unsigned value.

You can see the SASS differences here: https://godbolt.org/z/GjqYzejPz

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it matter in practice?

The change affects only conversions to u8 and given that we always actually convert to the 16-bit integer, for the valid range of inputs [0f .. 255f], the low 8 bits of the result will be correct regardless of whether we used signed or unsigned conversion. For the inputs outside of the valid input range LLVM IR considers results to be poison.

If the goal is to have upper bits of the result always be 0, even for out of range inputs, then this patch is insufficient. It will still fill in upper bits for some out-of range input values, because we're converting to a 16-bit int. E.g converting 65535.0 will give us 0xffff. If all-zero MSBs are indeed the goal here, we'd need to explicitly mask the low 8 bits after conversion, instead. Considering that this code has been in place in the current shape for a very long time, I do not think this is necessary in practice.

Perhaps I'm still missing something. Can you elaborate on what motivates this change and what exactly is the issue it is intended to solve. Simply using PTX instruction with a better matching name but no effect on the valid inputs is not worth plumbing special case into generic LLVM code. IMO.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I generally agree with your reasoning as to why this seems irrelevant on the PTX level. However...

Does it matter in practice?

Can you provide your opinion on the SASS diff I linked?
Without change:

cvt_u8_f32:
 F2I.S16.TRUNC.NTZ R4, R4 
 LOP3.LUT R4, R4, 0xffff, RZ, 0xc0, !PT 
 RET.ABS.NODEC R20 0x0

With change:

cvt_u8_f32:
 F2I.U16.TRUNC.NTZ R4, R4 
 RET.ABS.NODEC R20 0x0

It's definitely something of a corner case, but it does seem better. The biggest thing for me is that this implies that the signed/unsigned distinction is relevant for ptxas and is it not just a better name.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you provide your opinion on the SASS diff I linked?

There's obviously one less instruction. However, considering that float to u8 conversion happens approximately never, and that even when it does, it would be dwarfed by whatever operations we perform in FP, I'd be really surprised if you were to see any measurable differences on anything other than an artificial benchmark targeting just this instruction.

On the other hand, it adds non-zero complexity to LLVM which will need to be maintained ~forever. Considering near-zero benefit vs non-zero maintenance cost, I'd say it's not worth it. The impact is minor either way, so if the change could be plausibly used for other targets, I'd be fine with it.

@arsenm -- would AMDGPU benefit from the distinction in signedness in FP-to-int conversions?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the cases that require larger expansions, unsigned is always preferable

; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 | FileCheck %s

define i8 @cvt_u8_f32(float %x) {
; CHECK-LABEL: cvt_u8_f32(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .f32 %f<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.f32 %f1, [cvt_u8_f32_param_0];
; CHECK-NEXT: cvt.rzi.u16.f32 %rs1, %f1;
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%a = fptoui float %x to i8
ret i8 %a
}

define i8 @cvt_u8_f64(double %x) {
; CHECK-LABEL: cvt_u8_f64(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .f64 %fd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.f64 %fd1, [cvt_u8_f64_param_0];
; CHECK-NEXT: cvt.rzi.u16.f64 %rs1, %fd1;
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%a = fptoui double %x to i8
ret i8 %a
}

define float @cvt_f32_i8(i8 %x) {
; CHECK-LABEL: cvt_f32_i8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .f32 %f<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u8 %rs1, [cvt_f32_i8_param_0];
; CHECK-NEXT: cvt.rn.f32.u16 %f1, %rs1;
; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
; CHECK-NEXT: ret;
%a = uitofp i8 %x to float
ret float %a
}

define double @cvt_f64_i8(i8 %x) {
; CHECK-LABEL: cvt_f64_i8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .f64 %fd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u8 %rs1, [cvt_f64_i8_param_0];
; CHECK-NEXT: cvt.rn.f64.u16 %fd1, %rs1;
; CHECK-NEXT: st.param.f64 [func_retval0], %fd1;
; CHECK-NEXT: ret;
%a = uitofp i8 %x to double
ret double %a
}

define float @cvt_f32_s8(i8 %x) {
; CHECK-LABEL: cvt_f32_s8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .f32 %f<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.s8 %rs1, [cvt_f32_s8_param_0];
; CHECK-NEXT: cvt.rn.f32.s16 %f1, %rs1;
; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
; CHECK-NEXT: ret;
%a = sitofp i8 %x to float
ret float %a
}

define double @cvt_f64_s8(i8 %x) {
; CHECK-LABEL: cvt_f64_s8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .f64 %fd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.s8 %rs1, [cvt_f64_s8_param_0];
; CHECK-NEXT: cvt.rn.f64.s16 %fd1, %rs1;
; CHECK-NEXT: st.param.f64 [func_retval0], %fd1;
; CHECK-NEXT: ret;
%a = sitofp i8 %x to double
ret double %a
}

define i8 @cvt_s8_f32(float %x) {
; CHECK-LABEL: cvt_s8_f32(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.f32 %f1, [cvt_s8_f32_param_0];
; CHECK-NEXT: cvt.rzi.s16.f32 %rs1, %f1;
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
; CHECK-NEXT: and.b32 %r2, %r1, 255;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%a = fptosi float %x to i8
ret i8 %a
}

define i8 @cvt_s8_f64(double %x) {
; CHECK-LABEL: cvt_s8_f64(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f64 %fd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.f64 %fd1, [cvt_s8_f64_param_0];
; CHECK-NEXT: cvt.rzi.s16.f64 %rs1, %fd1;
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
; CHECK-NEXT: and.b32 %r2, %r1, 255;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%a = fptosi double %x to i8
ret i8 %a
}